Esempio n. 1
0
    def status(self, task_id, params, get_log=True):
        """Report whether a previously launched task is still alive.

        Opens an SSH connection described by ``params`` and probes the task:
        with ``docker inspect`` when ``params`` carries a ``container_id``,
        otherwise with ``kill -0`` on the negated ``pgid`` (a signal-0 probe of
        the process group).

        Args:
            task_id: Identifier forwarded to ``common.update_log``.
            params: Mapping with ``host``, ``port``, ``login``, ``login_cmd``,
                ``log_dir`` and either ``container_id`` or ``pgid``.
            get_log: When true, ``common.update_log`` is called before the
                connection is closed.

        Returns:
            ``"dead"`` if the probe command exits non-zero, ``"running"``
            otherwise.
        """
        client = common.ssh_connect_with_retry(
            params['host'],
            params['port'],
            params['login'],
            pkey=self._config.get('pkey'),
            key_filename=self._config.get('key_filename')
            or self._config.get('privateKey'),
            login_cmd=params['login_cmd'])

        # try/finally so the SSH connection is released even when the probe
        # or the log update raises.
        try:
            if 'container_id' in params:
                # NOTE(review): only the exit status is used; the state string
                # printed by `docker inspect` is ignored, so an exited but
                # not-yet-removed container presumably still counts as running.
                exit_status, _, _ = common.run_docker_command(
                    client,
                    'inspect -f {{.State.Status}} %s' % params['container_id'])
            else:
                exit_status, _, _ = common.run_command(
                    client, 'kill -0 -%d' % params['pgid'])

            if get_log:
                common.update_log(task_id, client, params['log_dir'],
                                  self._config.get('callback_url'))
        finally:
            client.close()

        if exit_status != 0:
            return "dead"

        return "running"
Esempio n. 2
0
 def terminate(self, params):
     """Stop a running task on its remote host.

     If ``params`` contains a ``container_id`` the container is force-removed
     and the method waits five seconds. It then probes the process group
     ``params['pgid']`` with ``kill -0`` and, if the probe succeeds, sends
     ``kill -9`` to the group. A non-zero exit status at either step is
     logged and ends the method early.

     Args:
         params: Mapping with ``host``, ``port``, ``login``, ``login_cmd``,
             ``pgid`` and optionally ``container_id``.
     """
     client = common.ssh_connect_with_retry(
         params['host'],
         params['port'],
         params['login'],
         pkey=self._config.get('pkey'),
         key_filename=self._config.get('key_filename')
         or self._config.get('privateKey'),
         login_cmd=params['login_cmd'])
     # A single finally replaces the per-branch close() calls and also covers
     # the case where one of the remote commands raises.
     try:
         if 'container_id' in params:
             common.run_docker_command(
                 client, 'rm --force %s' % params['container_id'])
             # Presumably gives docker time to tear the processes down before
             # the group is probed -- TODO confirm.
             time.sleep(5)
         exit_status, _, stderr = common.run_command(
             client, 'kill -0 -%d' % params['pgid'])
         if exit_status != 0:
             # The probe failed: nothing left to kill.
             logger.info("exist_status %d: %s", exit_status, stderr.read())
             return
         exit_status, _, stderr = common.run_command(
             client, 'kill -9 -%d' % params['pgid'])
         if exit_status != 0:
             logger.info("exist_status %d: %s", exit_status, stderr.read())
             return
         logger.info("successfully terminated")
     finally:
         client.close()
Esempio n. 3
0
 def launch(self, task_id, options, gpulist, resource, docker_registry,
            docker_image, docker_tag, docker_command, docker_files,
            wait_after_launch):
     """Start a task on ``resource`` over SSH and return its parameters.

     ``options['server']`` is overwritten with ``resource`` before the
     parameters are resolved. The SSH connection is closed whether or not
     ``common.launch_task`` succeeds.

     Returns:
         The resolved parameters, extended with the ``model`` and ``pgid``
         entries taken from the launched task.
     """
     options['server'] = resource
     config = self._config
     params = _get_params(config, options)
     ssh = common.ssh_connect_with_retry(
         params['server'],
         params['login'],
         config['privateKey'],
         login_cmd=params['login_cmd'])
     try:
         launched = common.launch_task(
             task_id, ssh, gpulist, params['log_dir'], config['docker'],
             docker_registry, docker_image, docker_tag, docker_command,
             docker_files, wait_after_launch,
             config.get('storages'),
             config.get('callback_url'),
             config.get('callback_interval'),
             requirements=config.get("requirements"))
     finally:
         ssh.close()
     for key in ('model', 'pgid'):
         params[key] = launched[key]
     return params
Esempio n. 4
0
 def check(self, options):
     """Check the remote environment described by ``options``.

     Connects over SSH with the configured ``privateKey`` and delegates to
     ``common.check_environment`` with the resolved ``gpu``, ``log_dir`` and
     the configured docker registries.

     Returns:
         Whatever ``common.check_environment`` returns.
     """
     params = _get_params(self._config, options)
     client = paramiko.client.SSHClient()
     # The connection attempt is inside the try so the client is closed even
     # when connecting fails part-way through its retries.
     try:
         common.ssh_connect_with_retry(
             client,
             params['server'],
             params['login'],
             self._config['privateKey'],
             login_cmd=params['login_cmd'])
         details = common.check_environment(
             client,
             params['gpu'],
             params['log_dir'],
             self._config['docker']['registries'])
     finally:
         client.close()
     return details
Esempio n. 5
0
    def launch(self, task_id, options, resource, docker_registry, docker_image,
               docker_tag, docker_command, docker_files, wait_after_launch):
        """Start an EC2 instance and launch the task on it over SSH.

        The instance is created through ``_run_instance`` and the method
        blocks until it is running. It then connects with the key file
        ``<privateKeysDirectory>/<key pair name>.pem``, mounts the configured
        corpus with ``common.fuse_s3_bucket`` and calls ``common.launch_task``.

        Returns:
            The task returned by ``common.launch_task`` with an added
            ``instance_id`` entry.

        Raises:
            RuntimeError: If the run-instances response is empty or lists no
                instance.
            Exception: Anything raised while connecting or launching is
                re-raised after the instance has been terminated (unless
                ``terminateOnError`` is configured false).
        """
        ec2_client = self._session.client("ec2")
        response = _run_instance(ec2_client, resource)
        if response is None:
            raise RuntimeError("empty response from boto3.run_instances")
        if not response["Instances"]:
            raise RuntimeError("no instances were created")
        instance_id = response["Instances"][0]["InstanceId"]
        ec2 = self._session.resource("ec2")
        instance = ec2.Instance(instance_id)
        instance.wait_until_running()
        logger.info("Instance %s is running.", instance.id)

        key_path = os.path.join(self._config["privateKeysDirectory"],
                                "%s.pem" % instance.key_pair.name)
        client = paramiko.SSHClient()
        try:
            common.ssh_connect_with_retry(
                client,
                instance.public_dns_name,
                self._config["amiUsername"],
                key_path,
                delay=self._config["sshConnectionDelay"],
                retry=self._config["maxSshConnectionRetry"])
            common.fuse_s3_bucket(client, self._config["corpus"])
            gpu_id = 1 if common.has_gpu_support(client) else 0
            task = common.launch_task(task_id, client, gpu_id,
                                      self._config["logDir"],
                                      self._config["docker"], docker_registry,
                                      docker_image, docker_tag, docker_command,
                                      docker_files, wait_after_launch,
                                      self._config.get('storages'),
                                      self._config.get('callback_url'),
                                      self._config.get('callback_interval'))
        except Exception:
            # Do not leave a billable instance behind after a failed launch.
            if self._config.get("terminateOnError", True):
                instance.terminate()
            # The finally clause below closes the client; a bare raise keeps
            # the original traceback.
            raise
        finally:
            client.close()
        task["instance_id"] = instance.id
        return task
Esempio n. 6
0
 def _get_client(self, params):
     """Open an SSH connection to the host described by ``params``.

     Authentication uses the configured ``pkey`` and a key file taken from
     ``key_filename``, falling back to ``privateKey``.

     Returns:
         The value returned by ``common.ssh_connect_with_retry``.
     """
     config = self._config
     host, port, login = params['host'], params['port'], params['login']
     private_key = config.get('pkey')
     key_file = config.get('key_filename') or config.get('privateKey')
     return common.ssh_connect_with_retry(
         host,
         port,
         login,
         pkey=private_key,
         key_filename=key_file,
         login_cmd=params['login_cmd'])
Esempio n. 7
0
def get_client(params, config):
    """Open an SSH connection using ``params`` and the service ``config``.

    The retry delay and retry count come from ``config["variables"]``; the
    key file is ``key_filename`` with ``privateKey`` as fallback.

    Returns:
        The value returned by ``common.ssh_connect_with_retry``.
    """
    host, port, login = params['host'], params['port'], params['login']
    private_key = config.get('pkey')
    key_file = config.get('key_filename') or config.get('privateKey')
    retry_delay = config["variables"]["sshConnectionDelay"]
    max_retry = config["variables"]["maxSshConnectionRetry"]
    return common.ssh_connect_with_retry(
        host,
        port,
        login,
        pkey=private_key,
        key_filename=key_file,
        delay=retry_delay,
        retry=max_retry)
Esempio n. 8
0
    def check(self, options):
        """Verify the log directory on the master node and summarize the queue.

        Connects to ``params['master_node']``, checks that ``log_dir`` exists
        with ``test -d`` and runs ``qstat`` from ``torque_install_path``.

        Returns:
            A string of the form ``"<n> jobs(s) in the queue"``.

        Raises:
            ValueError: If the log directory check fails.
            RuntimeError: If ``qstat`` exits with a non-zero status.
        """
        params = _get_params(self._config, options)
        client = paramiko.client.SSHClient()

        # Everything that touches the connection sits in one try/finally so
        # the client is closed on every path, including unexpected exceptions.
        try:
            common.ssh_connect_with_retry(client, params['master_node'],
                                          params['login'],
                                          self._config['privateKey'])

            # check log_dir
            if not common.run_and_check_command(
                    client, "test -d '%s'" % params['log_dir']):
                raise ValueError(
                    "incorrect log directory: %s" % params['log_dir'])

            status, stdout, _ = common.run_command(
                client, os.path.join(params['torque_install_path'], "qstat"))
            if status != 0:
                raise RuntimeError('qstat exited with code %s' % status)
            # Read the output while the connection is still open.
            output = stdout.read()
        finally:
            client.close()

        # The stream may yield bytes or text depending on the runtime; split
        # with a separator of the matching type.
        newline = b'\n' if isinstance(output, bytes) else '\n'
        # Presumably the two subtracted lines are the qstat header -- TODO
        # confirm. Clamp so empty output does not report a negative count.
        job_count = max(0, len(output.split(newline)) - 2)
        return "%s jobs(s) in the queue" % job_count
Esempio n. 9
0
 def check(self, options):
     """Check the remote environment described by ``options``.

     Resolves the parameters, opens an SSH connection and delegates to
     ``common.check_environment``; the connection is closed afterwards in
     all cases.

     Returns:
         Whatever ``common.check_environment`` returns.
     """
     cfg = self._config
     params = _get_params(cfg, options)
     ssh = common.ssh_connect_with_retry(
         params['host'], params['port'], params['login'],
         pkey=cfg.get('pkey'),
         key_filename=cfg.get('key_filename') or cfg.get('privateKey'),
         login_cmd=params['login_cmd'])
     try:
         return common.check_environment(
             ssh,
             params['gpus'],
             params['log_dir'],
             cfg['docker']['registries'],
             cfg.get('requirements'),
             False)
     finally:
         ssh.close()
Esempio n. 10
0
 def terminate(self, params):
     """Stop a running task on its remote host.

     A task with a ``container_id`` is stopped by force-removing the
     container. Otherwise the process group ``params['pgid']`` is probed
     with ``kill -0`` and, if the probe succeeds, killed with ``kill -9``;
     a non-zero exit status at either step is logged and ends the method.

     Args:
         params: Mapping with ``server``, ``login``, ``login_cmd`` and either
             ``container_id`` or ``pgid``.
     """
     client = common.ssh_connect_with_retry(
         params['server'],
         params['login'],
         self._config['privateKey'],
         login_cmd=params['login_cmd'])
     # One finally replaces the per-branch close() calls and also releases
     # the connection when a remote command raises.
     try:
         if 'container_id' in params:
             common.run_docker_command(
                 client, 'rm --force %s' % params['container_id'])
         else:
             exit_status, _, stderr = common.run_command(
                 client, 'kill -0 -%d' % params['pgid'])
             if exit_status != 0:
                 # The probe failed: nothing left to kill.
                 logger.debug("exist_status %d: %s", exit_status,
                              stderr.read())
                 return
             exit_status, _, stderr = common.run_command(
                 client, 'kill -9 -%d' % params['pgid'])
             if exit_status != 0:
                 logger.debug("exist_status %d: %s", exit_status,
                              stderr.read())
                 return
         logger.debug("successfully terminated")
     finally:
         client.close()
Esempio n. 11
0
 def launch(self, task_id, options, xpulist, resource, docker_registry,
            docker_image, docker_tag, docker_command, docker_files,
            wait_after_launch, auth_token, support_statistics):
     """Start a task on ``resource`` over SSH and return its parameters.

     ``options['server']`` is overwritten with ``resource`` before the
     parameters are resolved. When an ``auth_token`` is given and a callback
     URL is configured, the token is embedded in the URL as
     ``scheme://<token>:x@...`` before it is handed to
     ``common.launch_task``.

     Returns:
         The resolved parameters, extended with the ``model`` and ``pgid``
         entries taken from the launched task.
     """
     options['server'] = resource
     params = _get_params(self._config, options)
     client = common.ssh_connect_with_retry(
         params['host'],
         params['port'],
         params['login'],
         pkey=self._config.get('pkey'),
         key_filename=self._config.get('key_filename')
         or self._config.get('privateKey'),
         login_cmd=params['login_cmd'])
     try:
         callback_url = self._config.get('callback_url')
         # callback_url is optional (read with .get); only decorate it when
         # it is actually set, otherwise .replace would fail on None.
         if auth_token and callback_url:
             callback_url = callback_url.replace("://",
                                                 "://" + auth_token + ":x@")
         task = common.launch_task(
             task_id,
             client,
             xpulist,
             params['log_dir'],
             self._config['docker'],
             docker_registry,
             docker_image,
             docker_tag,
             docker_command,
             docker_files,
             wait_after_launch,
             self._config.get('storages'),
             callback_url,
             self._config.get('callback_interval'),
             requirements=self._config.get("requirements"),
             support_statistics=support_statistics)
     finally:
         client.close()
     params['model'] = task['model']
     params['pgid'] = task['pgid']
     return params
Esempio n. 12
0
    def launch(self, task_id, options, gpulist, resource, storages,
               docker_config, docker_registry, docker_image, docker_tag,
               docker_command, docker_files, wait_after_launch):
        """Submit the task to Torque as a generated PBS job script.

        The script is assembled as a quoted here-document, echoed on the
        master node and piped into ``qsub -V``. Inside the job it derives the
        GPU device from ``PBS_GPUFILE``, optionally logs in to a private
        registry, pulls the image, runs the docker command and, when a
        callback URL is configured, posts the log and a completion
        notification.

        Returns:
            The resolved parameters, extended with ``model`` (the task id)
            and ``qsub_id`` (the stripped standard output of ``qsub``).

        Raises:
            RuntimeError: If the submission command exits non-zero.
        """
        params = _get_params(self._config, options)

        client = common.ssh_connect_with_retry(params['master_node'],
                                               params['login'],
                                               self._config['privateKey'])

        # try/finally so the connection is released even if building the
        # script (e.g. an unknown registry) or the submission raises.
        try:
            script = [
                "cat <<-'EOF'\n",
                "#!/bin/bash\n",
                "#PBS -l nodes=1:ppn=2:gpus=1,mem=%sG,walltime=10000:00:00\n"
                % params['mem'],
                "#PBS -p %d\n" % params['priority'],
                "#PBS -N infTraining\n",
                "#PBS -o %s/%s.log -j oe\n" % (params['log_dir'], task_id),

                # Shell helper computing the device id(s) from PBS_GPUFILE.
                "guessdevice(){\n",
                "    if [ -e \"${PBS_GPUFILE}\" ]\n",
                "    then\n",
                "        GPUS=`cat ${PBS_GPUFILE} | perl -pe 's/[^-]+-gpu//g' |",
                # Backslash escaped explicitly: '\s' is not a valid Python
                # string escape.
                " perl -pe 's/\\s+/ /g' | perl -pe 's/,$//g'`\n",
                "        GPUS=`echo \"${GPUS}+1\" | bc `\n",
                "        echo $GPUS;\n",
                "    else\n",
                "        echo \"error: No available GPU\"\n",
                "    fi\n",
                "}\n",

                "DEVICE=$(guessdevice)\n",
                "echo \"RUN ON GPU ${DEVICE}\"\n",
            ]

            registry = docker_config['registries'][docker_registry]
            registry_uri = registry['uri']
            registry_urip = '' if registry_uri == '' else registry_uri + '/'
            image_ref = '%s%s:%s' % (registry_urip, docker_image, docker_tag)

            if registry['type'] != 'dockerhub':
                cmd_connect = common.cmd_connect_private_registry(registry)
                script.append("echo '=> " + cmd_connect + "'\n")
                script.append(cmd_connect + '\n')

            cmd_docker_pull = common.cmd_docker_pull(image_ref)
            script.append("echo '=> " + cmd_docker_pull + "'\n")
            script.append(cmd_docker_pull + '\n')

            docker_cmd = "echo | " + common.cmd_docker_run(
                "$DEVICE", docker_config, task_id, image_ref, storages,
                self._config['callback_url'],
                self._config['callback_interval'], docker_command)

            # Escape embedded double quotes so the echoed command stays one
            # quoted string (same treatment as the callback command below).
            script.append(
                "echo \"=> " + docker_cmd.replace("\"", "\\\"") + "\"\n")
            script.append(docker_cmd + '\n')

            if self._config['callback_url']:
                callback_cmd = ''
                if params['log_dir'] is not None and params['log_dir'] != '':
                    callback_cmd = (
                        'curl -X POST "%s/log/%s" --data-binary "@%s/%s.log" ; '
                        % (self._config['callback_url'], task_id,
                           params['log_dir'], task_id))

                callback_cmd += 'curl "%s/terminate/%s?phase=completed"' % (
                    self._config['callback_url'], task_id)
                script.append(
                    "echo \"=> " + callback_cmd.replace("\"", "\\\"") + "\"\n")
                script.append(callback_cmd + '\n')

            script.append("EOF\n")

            qsub_cmd = "echo \"$(%s)\" | %s" % (
                "".join(script),
                os.path.join(params['torque_install_path'], "qsub -V"))

            exit_status, stdout, stderr = common.run_command(client, qsub_cmd)
            if exit_status != 0:
                raise RuntimeError('run exited with code %d: %s' %
                                   (exit_status, stderr.read()))
            # Read the job id while the connection is still open.
            qsub_id = stdout.read().strip()
        finally:
            client.close()

        params['model'] = task_id
        params['qsub_id'] = qsub_id
        return params
Esempio n. 13
0
    def launch(
            self,
            task_id,
            options,
            xpulist,
            resource,
            storages,  # pylint: disable=unused-argument
            docker_config,
            docker_registry,
            docker_image,
            docker_tag,
            docker_command,
            docker_files,
            wait_after_launch,
            auth_token,
            support_statistics):
        """Start an EC2 instance and launch the task on it over SSH.

        The instance is created through ``_run_instance`` from the template
        named ``params["name"]`` and the method blocks until it is running.
        It then connects on port 22, mounts each configured corpus (and the
        temporary model storage, when configured) with
        ``common.fuse_s3_bucket`` and calls ``common.launch_task``.

        Returns:
            The task returned by ``common.launch_task`` with an added
            ``instance_id`` entry.

        Raises:
            EnvironmentError: If creating the instance raises ``ClientError``.
            RuntimeError: If the run-instances response is empty or lists no
                instance.
            Exception: Anything raised while connecting, mounting or
                launching is re-raised after the instance has been terminated
                (unless ``terminateOnError`` is configured false).
        """
        options['server'] = resource
        params = _get_params(self._templates, options)
        ec2_client = self._session.client("ec2")
        try:
            response = _run_instance(
                ec2_client,
                params["name"],
                task_id=task_id,
                instance_init_limit=self._config["variables"].get(
                    "instanceInitLimit"))
        except ClientError as error:
            raise EnvironmentError('Create instance failed: %s' %
                                   error) from error
        if response is None:
            raise RuntimeError("empty response from boto3.run_instances")
        if not response["Instances"]:
            raise RuntimeError("no instances were created")
        instance_id = response["Instances"][0]["InstanceId"]
        ec2 = self._session.resource("ec2")
        instance = ec2.Instance(instance_id)
        instance.wait_until_running()
        logger.info("EC2 - Instance %s is running.", instance.id)

        # Placeholder so the finally clause always has something to close,
        # even when the connection attempt itself fails.
        client = paramiko.SSHClient()
        try:
            client = common.ssh_connect_with_retry(
                instance.public_dns_name,
                22,
                params['login'],
                pkey=self._config.get('pkey'),
                key_filename=self._config.get('key_filename')
                or self._config.get('privateKey'),
                delay=self._config["variables"]["sshConnectionDelay"],
                retry=self._config["variables"]["maxSshConnectionRetry"])

            # mounting corpus and temporary model directories
            corpus_dir = self._config["corpus"]
            if not isinstance(corpus_dir, list):
                corpus_dir = [corpus_dir]
            for corpus_description in corpus_dir:
                common.fuse_s3_bucket(client, corpus_description)
            if self._config["variables"].get("temporary_model_storage"):
                common.fuse_s3_bucket(
                    client,
                    self._config["variables"]["temporary_model_storage"])

            callback_url = self._config.get('callback_url')
            # callback_url is optional (read with .get); only decorate it
            # when it is actually set, otherwise .replace would fail on None.
            if auth_token and callback_url:
                callback_url = callback_url.replace("://",
                                                    "://" + auth_token + ":x@")
            task = common.launch_task(task_id,
                                      client, (xpulist[0], None),
                                      params,
                                      docker_config,
                                      docker_registry,
                                      docker_image,
                                      docker_tag,
                                      docker_command,
                                      docker_files,
                                      wait_after_launch,
                                      self._config.get('storages'),
                                      callback_url,
                                      self._config.get('callback_interval'),
                                      support_statistics=support_statistics)
        except Exception:
            # Do not leave a billable instance behind after a failed launch.
            if self._config["variables"].get("terminateOnError", True):
                instance.terminate()
                logger.info("Terminated instance (on launch error): %s.",
                            instance_id)
            # The finally clause below closes the client; a bare raise keeps
            # the original traceback.
            raise
        finally:
            client.close()
            # Drop the module-level per-task entry, success or failure.
            global num_of_inits
            if num_of_inits.get(task_id):
                num_of_inits.pop(task_id)
        task["instance_id"] = instance.id
        return task