def status(self, task_id, params, get_log=True):
    """Probe the remote host and report whether the task is still alive.

    Args:
        task_id: Identifier forwarded to ``common.update_log``.
        params: Mapping with the connection settings (``host``, ``port``,
            ``login``, ``login_cmd``), ``log_dir``, and either
            ``container_id`` or ``pgid``.
        get_log: When true, ``common.update_log`` is called before the
            connection is closed.

    Returns:
        ``"dead"`` when the probe command exits with a non-zero status,
        ``"running"`` otherwise.
    """
    client = common.ssh_connect_with_retry(
        params['host'],
        params['port'],
        params['login'],
        pkey=self._config.get('pkey'),
        key_filename=self._config.get('key_filename') or self._config.get('privateKey'),
        login_cmd=params['login_cmd'])
    try:
        if 'container_id' in params:
            # NOTE(review): only the exit status is used; the container state
            # printed by `inspect` is ignored - confirm this is intended.
            exit_status, _, _ = common.run_docker_command(
                client, 'inspect -f {{.State.Status}} %s' % params['container_id'])
        else:
            # Signal 0 sends nothing; it only tests that the process group exists.
            exit_status, _, _ = common.run_command(
                client, 'kill -0 -%d' % params['pgid'])
        if get_log:
            common.update_log(task_id, client, params['log_dir'],
                              self._config.get('callback_url'))
    finally:
        # Always release the connection, even if a remote command raised.
        client.close()
    if exit_status != 0:
        return "dead"
    return "running"
def terminate(self, params):
    """Stop the task described by ``params`` on the remote host.

    If ``params`` holds a ``container_id``, the container is force-removed
    and the method waits five seconds. The process group ``params['pgid']``
    is then probed with signal 0 and, if it still exists, killed with
    signal 9. The outcome is only logged; nothing is returned.

    Args:
        params: Mapping with the connection settings (``host``, ``port``,
            ``login``, ``login_cmd``), ``pgid`` and optionally
            ``container_id``.
    """
    client = common.ssh_connect_with_retry(
        params['host'],
        params['port'],
        params['login'],
        pkey=self._config.get('pkey'),
        key_filename=self._config.get('key_filename') or self._config.get('privateKey'),
        login_cmd=params['login_cmd'])
    try:
        if 'container_id' in params:
            common.run_docker_command(client, 'rm --force %s' % params['container_id'])
            time.sleep(5)
        # Probe first: a non-zero status means there is nothing left to kill.
        exit_status, _, stderr = common.run_command(
            client, 'kill -0 -%d' % params['pgid'])
        if exit_status != 0:
            logger.info("exist_status %d: %s", exit_status, stderr.read())
            return
        exit_status, _, stderr = common.run_command(
            client, 'kill -9 -%d' % params['pgid'])
        if exit_status != 0:
            logger.info("exist_status %d: %s", exit_status, stderr.read())
            return
        logger.info("successfully terminated")
    finally:
        # Single cleanup point: covers every return path and any exception.
        client.close()
def launch(self, task_id, options, gpulist, resource, docker_registry,
           docker_image, docker_tag, docker_command, docker_files,
           wait_after_launch):
    """Launch a task on ``resource`` over SSH and return its parameters.

    ``options['server']`` is overwritten with ``resource`` before the
    parameters are resolved. The returned mapping is the resolved parameters
    extended with the ``model`` and ``pgid`` values reported by
    ``common.launch_task``.
    """
    options['server'] = resource
    params = _get_params(self._config, options)
    cfg = self._config
    ssh = common.ssh_connect_with_retry(
        params['server'],
        params['login'],
        cfg['privateKey'],
        login_cmd=params['login_cmd'])
    try:
        launched = common.launch_task(
            task_id,
            ssh,
            gpulist,
            params['log_dir'],
            cfg['docker'],
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            cfg.get('storages'),
            cfg.get('callback_url'),
            cfg.get('callback_interval'),
            requirements=cfg.get("requirements"))
    finally:
        ssh.close()
    params['model'] = launched['model']
    params['pgid'] = launched['pgid']
    return params
def check(self, options):
    """Connect to the configured server and return its environment details.

    Args:
        options: Service options passed to ``_get_params`` together with the
            service configuration.

    Returns:
        Whatever ``common.check_environment`` returns for the server.
    """
    params = _get_params(self._config, options)
    client = paramiko.client.SSHClient()
    try:
        # Connecting inside the try block means a failed connection attempt
        # still goes through client.close().
        common.ssh_connect_with_retry(
            client,
            params['server'],
            params['login'],
            self._config['privateKey'],
            login_cmd=params['login_cmd'])
        details = common.check_environment(
            client,
            params['gpu'],
            params['log_dir'],
            self._config['docker']['registries'])
    finally:
        client.close()
    return details
def launch(self, task_id, options, resource, docker_registry, docker_image,
           docker_tag, docker_command, docker_files, wait_after_launch):
    """Start an EC2 instance for ``resource`` and launch the task on it.

    The instance is created through ``_run_instance``. Once it is running,
    an SSH connection is opened, the configured corpus bucket is mounted and
    the task is started with ``common.launch_task``.

    Returns:
        The task mapping returned by ``common.launch_task`` with an added
        ``instance_id`` entry.

    Raises:
        RuntimeError: If the run-instances response is empty or lists no
            instance.
        Exception: Any error raised while connecting or launching is
            re-raised. The instance is terminated first unless the
            ``terminateOnError`` configuration entry is false.
    """
    ec2_client = self._session.client("ec2")
    response = _run_instance(ec2_client, resource)
    if response is None:
        raise RuntimeError("empty response from boto3.run_instances")
    if not response["Instances"]:
        raise RuntimeError("no instances were created")
    instance_id = response["Instances"][0]["InstanceId"]
    ec2 = self._session.resource("ec2")
    instance = ec2.Instance(instance_id)
    instance.wait_until_running()
    logger.info("Instance %s is running.", instance.id)

    key_path = os.path.join(
        self._config["privateKeysDirectory"],
        "%s.pem" % instance.key_pair.name)
    client = paramiko.SSHClient()
    try:
        common.ssh_connect_with_retry(
            client,
            instance.public_dns_name,
            self._config["amiUsername"],
            key_path,
            delay=self._config["sshConnectionDelay"],
            retry=self._config["maxSshConnectionRetry"])
        common.fuse_s3_bucket(client, self._config["corpus"])
        gpu_id = 1 if common.has_gpu_support(client) else 0
        task = common.launch_task(
            task_id,
            client,
            gpu_id,
            self._config["logDir"],
            self._config["docker"],
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            self._config.get('callback_url'),
            self._config.get('callback_interval'))
    except Exception:
        if self._config.get("terminateOnError", True):
            instance.terminate()
        # Bare raise keeps the original traceback; the finally clause below
        # closes the client, so no explicit close is needed here.
        raise
    finally:
        client.close()
    task["instance_id"] = instance.id
    return task
def _get_client(self, params):
    """Open and return an SSH connection to the host described by ``params``.

    The key file comes from the ``key_filename`` configuration entry, falling
    back to ``privateKey`` when the former is missing or empty.
    """
    endpoint = (params['host'], params['port'], params['login'])
    cfg = self._config
    connect_kwargs = {
        'pkey': cfg.get('pkey'),
        'key_filename': cfg.get('key_filename') or cfg.get('privateKey'),
        'login_cmd': params['login_cmd'],
    }
    return common.ssh_connect_with_retry(*endpoint, **connect_kwargs)
def get_client(params, config):
    """Open and return an SSH connection built from ``params`` and ``config``.

    The retry delay and retry count are read from ``config["variables"]``.
    The key file is ``key_filename``, falling back to ``privateKey``.
    """
    endpoint = (params['host'], params['port'], params['login'])
    connect_kwargs = {
        'pkey': config.get('pkey'),
        'key_filename': config.get('key_filename') or config.get('privateKey'),
        'delay': config["variables"]["sshConnectionDelay"],
        'retry': config["variables"]["maxSshConnectionRetry"],
    }
    return common.ssh_connect_with_retry(*endpoint, **connect_kwargs)
def check(self, options):
    """Validate the log directory on the master node and summarise the queue.

    Args:
        options: Service options passed to ``_get_params`` together with the
            service configuration.

    Returns:
        A string of the form ``"<n> jobs(s) in the queue"`` derived from the
        number of lines printed by ``qstat``.

    Raises:
        ValueError: If ``params['log_dir']`` is not a directory on the
            master node.
        RuntimeError: If ``qstat`` exits with a non-zero status.
    """
    params = _get_params(self._config, options)
    client = paramiko.client.SSHClient()
    try:
        common.ssh_connect_with_retry(
            client,
            params['master_node'],
            params['login'],
            self._config['privateKey'])
        # check log_dir
        if not common.run_and_check_command(
                client, "test -d '%s'" % params['log_dir']):
            raise ValueError("incorrect log directory: %s" % params['log_dir'])
        status, stdout, _ = common.run_command(
            client, os.path.join(params['torque_install_path'], "qstat"))
        if status != 0:
            raise RuntimeError('qstat exited with code %s' % status)
        # Drain the output while the connection is still open.
        output = stdout.read()
    finally:
        # Single cleanup point for success and for every error path.
        client.close()
    if isinstance(output, bytes):
        # Remote output may arrive as bytes; splitting bytes on a str
        # separator raises TypeError on Python 3.
        output = output.decode('utf-8', 'replace')
    # NOTE(review): the "- 2" presumably discounts qstat's header lines -
    # confirm against real qstat output, including the empty-queue case.
    return "%s jobs(s) in the queue" % (len(output.split('\n')) - 2)
def check(self, options):
    """Connect to the configured host and return its environment details.

    The result is whatever ``common.check_environment`` reports. The SSH
    connection is closed before returning, including on error.
    """
    params = _get_params(self._config, options)
    cfg = self._config
    ssh = common.ssh_connect_with_retry(
        params['host'],
        params['port'],
        params['login'],
        pkey=cfg.get('pkey'),
        key_filename=cfg.get('key_filename') or cfg.get('privateKey'),
        login_cmd=params['login_cmd'])
    try:
        return common.check_environment(
            ssh,
            params['gpus'],
            params['log_dir'],
            cfg['docker']['registries'],
            cfg.get('requirements'),
            False)
    finally:
        ssh.close()
def terminate(self, params):
    """Stop the task described by ``params`` on the remote server.

    A task with a ``container_id`` has its container force-removed.
    Otherwise the process group ``params['pgid']`` is probed with signal 0
    and, if it still exists, killed with signal 9. The outcome is only
    logged; nothing is returned.

    Args:
        params: Mapping with ``server``, ``login``, ``login_cmd`` and either
            ``container_id`` or ``pgid``.
    """
    client = common.ssh_connect_with_retry(
        params['server'],
        params['login'],
        self._config['privateKey'],
        login_cmd=params['login_cmd'])
    try:
        if 'container_id' in params:
            common.run_docker_command(client, 'rm --force %s' % params['container_id'])
        else:
            # Probe first: a non-zero status means there is nothing to kill.
            exit_status, _, stderr = common.run_command(
                client, 'kill -0 -%d' % params['pgid'])
            if exit_status != 0:
                logger.debug("exist_status %d: %s", exit_status, stderr.read())
                return
            exit_status, _, stderr = common.run_command(
                client, 'kill -9 -%d' % params['pgid'])
            if exit_status != 0:
                logger.debug("exist_status %d: %s", exit_status, stderr.read())
                return
        logger.debug("successfully terminated")
    finally:
        # Single cleanup point: covers every return path and any exception.
        client.close()
def launch(self, task_id, options, xpulist, resource, docker_registry,
           docker_image, docker_tag, docker_command, docker_files,
           wait_after_launch, auth_token, support_statistics):
    """Launch a task on ``resource`` over SSH and return its parameters.

    ``options['server']`` is overwritten with ``resource`` before the
    parameters are resolved. When ``auth_token`` is set and a callback URL
    is configured, the token is embedded in the URL as ``<token>:x@`` right
    after the scheme separator.

    Returns:
        The resolved parameters extended with the ``model`` and ``pgid``
        values reported by ``common.launch_task``.
    """
    options['server'] = resource
    params = _get_params(self._config, options)
    client = common.ssh_connect_with_retry(
        params['host'],
        params['port'],
        params['login'],
        pkey=self._config.get('pkey'),
        key_filename=self._config.get('key_filename') or self._config.get('privateKey'),
        login_cmd=params['login_cmd'])
    try:
        callback_url = self._config.get('callback_url')
        # The callback URL is optional (read with .get); only inject
        # credentials when there is a URL to inject them into.
        if auth_token and callback_url:
            callback_url = callback_url.replace("://", "://" + auth_token + ":x@")
        task = common.launch_task(
            task_id,
            client,
            xpulist,
            params['log_dir'],
            self._config['docker'],
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            callback_url,
            self._config.get('callback_interval'),
            requirements=self._config.get("requirements"),
            support_statistics=support_statistics)
    finally:
        client.close()
    params['model'] = task['model']
    params['pgid'] = task['pgid']
    return params
def launch(self, task_id, options, gpulist, resource, storages,
           docker_config, docker_registry, docker_image, docker_tag,
           docker_command, docker_files, wait_after_launch):
    """Submit the task as a Torque/PBS job through ``qsub`` on the master node.

    A bash job script is assembled inside a quoted here-document, so the
    shell on the master node passes it on verbatim. It is then piped to
    ``qsub -V``. The script resolves the GPU assigned by PBS, logs in to the
    registry when it is not of type ``dockerhub``, pulls the image, runs the
    container and, when a callback URL is configured, posts the log and a
    completion notice.

    Returns:
        The resolved parameters extended with ``model`` (the task id) and
        ``qsub_id`` (the stripped standard output of ``qsub``).

    Raises:
        RuntimeError: If the ``qsub`` command exits with a non-zero status.
    """
    params = _get_params(self._config, options)
    client = common.ssh_connect_with_retry(
        params['master_node'],
        params['login'],
        self._config['privateKey'])
    try:
        cmd = "cat <<-'EOF'\n"
        cmd += "#!/bin/bash\n"
        cmd += "#PBS -l nodes=1:ppn=2:gpus=1,mem=%sG,walltime=10000:00:00\n" % params['mem']
        cmd += "#PBS -p %d\n" % params['priority']
        cmd += "#PBS -N infTraining\n"
        cmd += "#PBS -o %s/%s.log -j oe\n" % (params['log_dir'], task_id)
        cmd += "guessdevice(){\n"
        cmd += " if [ -e \"${PBS_GPUFILE}\" ]\n"
        cmd += " then\n"
        cmd += " GPUS=`cat ${PBS_GPUFILE} | perl -pe 's/[^-]+-gpu//g' |"
        # "\\s" yields the same two characters as the former "\s" without
        # relying on an invalid string escape.
        cmd += " perl -pe 's/\\s+/ /g' | perl -pe 's/,$//g'`\n"
        cmd += " GPUS=`echo \"${GPUS}+1\" | bc `\n"
        cmd += " echo $GPUS;\n"
        cmd += " else\n"
        cmd += " echo \"error: No available GPU\"\n"
        cmd += " fi\n"
        cmd += "}\n"
        cmd += "DEVICE=$(guessdevice)\n"
        cmd += "echo \"RUN ON GPU ${DEVICE}\"\n"

        registry = docker_config['registries'][docker_registry]
        registry_uri = registry['uri']
        registry_urip = '' if registry_uri == '' else registry_uri + '/'
        image_ref = '%s%s:%s' % (registry_urip, docker_image, docker_tag)

        if registry['type'] != 'dockerhub':
            cmd_connect = common.cmd_connect_private_registry(registry)
            cmd += "echo '=> " + cmd_connect + "'\n"
            cmd += cmd_connect + '\n'

        cmd_docker_pull = common.cmd_docker_pull(image_ref)
        cmd += "echo '=> " + cmd_docker_pull + "'\n"
        cmd += cmd_docker_pull + '\n'

        docker_cmd = "echo | " + common.cmd_docker_run(
            "$DEVICE", docker_config, task_id, image_ref, storages,
            self._config['callback_url'], self._config['callback_interval'],
            docker_command)
        # Escape embedded double quotes so the echoed command is printed
        # faithfully, as the callback echo below already does.
        cmd += "echo \"=> " + docker_cmd.replace("\"", "\\\"") + "\"\n"
        cmd += docker_cmd + '\n'

        if self._config['callback_url']:
            callback_cmd = ''
            if params['log_dir'] is not None and params['log_dir'] != '':
                callback_cmd = 'curl -X POST "%s/log/%s" --data-binary "@%s/%s.log" ; ' % (
                    self._config['callback_url'], task_id,
                    params['log_dir'], task_id)
            callback_cmd += 'curl "%s/terminate/%s?phase=completed"' % (
                self._config['callback_url'], task_id)
            cmd += "echo \"=> " + callback_cmd.replace("\"", "\\\"") + "\"\n"
            cmd += callback_cmd + '\n'
        cmd += "EOF\n"

        qsub_cmd = "echo \"$(%s)\" | %s" % (
            cmd, os.path.join(params['torque_install_path'], "qsub -V"))

        exit_status, stdout, stderr = common.run_command(client, qsub_cmd)
        if exit_status != 0:
            raise RuntimeError('run exited with code %d: %s'
                               % (exit_status, stderr.read()))
        # Read the job id while the connection is still open.
        qsub_id = stdout.read().strip()
    finally:
        # Close on success, on qsub failure and on any unexpected exception.
        client.close()

    params['model'] = task_id
    params['qsub_id'] = qsub_id
    return params
def launch(
        self,
        task_id,
        options,
        xpulist,
        resource,
        storages,  # pylint: disable=unused-argument
        docker_config,
        docker_registry,
        docker_image,
        docker_tag,
        docker_command,
        docker_files,
        wait_after_launch,
        auth_token,
        support_statistics):
    """Start an EC2 instance from a template and launch the task on it.

    ``options['server']`` is overwritten with ``resource`` before the
    parameters are resolved. Once the instance is running, an SSH connection
    is opened, every configured corpus bucket (and the temporary model
    storage, when configured) is mounted, and the task is started with
    ``common.launch_task``. When ``auth_token`` is set and a callback URL is
    configured, the token is embedded in the URL as ``<token>:x@``.

    Returns:
        The task mapping returned by ``common.launch_task`` with an added
        ``instance_id`` entry.

    Raises:
        EnvironmentError: If instance creation fails with a ``ClientError``.
        RuntimeError: If the run-instances response is empty or lists no
            instance.
        Exception: Any error raised while connecting, mounting or launching
            is re-raised. The instance is terminated first unless the
            ``terminateOnError`` variable is false.
    """
    options['server'] = resource
    params = _get_params(self._templates, options)
    ec2_client = self._session.client("ec2")
    try:
        response = _run_instance(
            ec2_client,
            params["name"],
            task_id=task_id,
            instance_init_limit=self._config["variables"].get("instanceInitLimit"))
    except ClientError as error:
        raise EnvironmentError('Create instance failed: %s' % error) from error
    if response is None:
        raise RuntimeError("empty response from boto3.run_instances")
    if not response["Instances"]:
        raise RuntimeError("no instances were created")
    instance_id = response["Instances"][0]["InstanceId"]
    ec2 = self._session.resource("ec2")
    instance = ec2.Instance(instance_id)
    instance.wait_until_running()
    logger.info("EC2 - Instance %s is running.", instance.id)

    # Placeholder so the finally clause always has an object to close, even
    # when the connection attempt itself fails.
    client = paramiko.SSHClient()
    try:
        client = common.ssh_connect_with_retry(
            instance.public_dns_name,
            22,
            params['login'],
            pkey=self._config.get('pkey'),
            key_filename=self._config.get('key_filename') or self._config.get('privateKey'),
            delay=self._config["variables"]["sshConnectionDelay"],
            retry=self._config["variables"]["maxSshConnectionRetry"])

        # mounting corpus and temporary model directories
        corpus_dir = self._config["corpus"]
        if not isinstance(corpus_dir, list):
            corpus_dir = [corpus_dir]
        for corpus_description in corpus_dir:
            common.fuse_s3_bucket(client, corpus_description)
        if self._config["variables"].get("temporary_model_storage"):
            common.fuse_s3_bucket(
                client, self._config["variables"]["temporary_model_storage"])

        callback_url = self._config.get('callback_url')
        # The callback URL is optional (read with .get); only inject
        # credentials when there is a URL to inject them into.
        if auth_token and callback_url:
            callback_url = callback_url.replace("://", "://" + auth_token + ":x@")

        task = common.launch_task(
            task_id,
            client,
            (xpulist[0], None),
            params,
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            self._config.get('storages'),
            callback_url,
            self._config.get('callback_interval'),
            support_statistics=support_statistics)
    except Exception:
        if self._config["variables"].get("terminateOnError", True):
            instance.terminate()
            logger.info("Terminated instance (on launch error): %s.", instance_id)
        # Bare raise keeps the original traceback; the finally clause below
        # closes the client, so no explicit close is needed here.
        raise
    finally:
        client.close()

    # num_of_inits is only mutated, never rebound, so no `global` statement
    # is required.
    if num_of_inits.get(task_id):
        num_of_inits.pop(task_id)
    task["instance_id"] = instance.id
    return task