Example #1
    def _runjob(self):
        """Runs a job"""
        #check for valid user/group
        try:
            userid, groupid = pwd.getpwnam(self.user)[2:4]
        except KeyError:
            raise ProcessGroupCreationError("error getting uid/gid")

        try:
            os.setgid(groupid)
            os.setuid(userid)
        except OSError:
            logger.error(
                "failed to change userid/groupid for process group %s" %
                (self.id))
            os._exit(1)

        if self.umask is not None:
            try:
                os.umask(self.umask)
            except Exception:
                logger.error("Failed to set umask to %s" % self.umask)
        try:
            partition = self.location[0]
        except IndexError:
            raise ProcessGroupCreationError("no location")

        kerneloptions = self.kerneloptions

        # export subset of MPIRUN_* variables to mpirun's environment
        # we explicitly state the ones we want since some are "dangerous"
        exportenv = [
            'MPIRUN_CONNECTION', 'MPIRUN_KERNEL_OPTIONS', 'MPIRUN_MAPFILE',
            'MPIRUN_START_GDBSERVER', 'MPIRUN_LABEL', 'MPIRUN_NW',
            'MPIRUN_VERBOSE', 'MPIRUN_ENABLE_TTY_REPORTING', 'MPIRUN_STRACE'
        ]
        app_envs = []
        for key, value in self.env.iteritems():
            if key in exportenv:
                os.environ[key] = value
            else:
                app_envs.append((key, value))

        envs = " ".join(["%s=%s" % x for x in app_envs])
        atexit._atexit = []

        try:
            stdin = open(self.stdin, 'r')
        except (IOError, OSError, TypeError), e:
            logger.error(
                "process group %s: error opening stdin file %s: %s (stdin will be /dev/null)"
                % (self.id, self.stdin, e))
            stdin = open("/dev/null", 'r')
Example #2
    def add_process_groups(self, specs):
        """Create a process group.
        
        Arguments:
        spec -- dictionary hash specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode', False) == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        script_pgroups = []
        if script_specs:
            for spec in script_specs:
                try:
                    self._set_kernel(
                        spec.get('location')[0], spec.get('kernel', "default"))
                except Exception, e:
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    pgroup.exit_status = 1
                    self.logger.info(
                        "process group %s: job %s/%s failed to set the kernel; %s",
                        pgroup.id, pgroup.jobid, pgroup.user, e)
                else:
                    try:
                        script_pgroup = ComponentProxy(
                            "script-manager").add_jobs([spec])
                    except (ComponentLookupError, xmlrpclib.Fault):
                        self._clear_kernel(spec.get('location')[0])
                        # FIXME: jobs that were already started are not reported
                        raise ProcessGroupCreationError(
                            "system::add_process_groups failed to communicate with script-manager"
                        )
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.script_id = script_pgroup[0]['id']
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    self.logger.info(
                        "job %s/%s: process group %s created to track script",
                        pgroup.jobid, pgroup.user, pgroup.id)
                    self.reserve_resources_until(
                        spec['location'],
                        time.time() + 60 * float(spec['walltime']),
                        pgroup.jobid)
                    if pgroup.kernel != "default":
                        self.logger.info(
                            "process group %s: job %s/%s using kernel %s",
                            pgroup.id, pgroup.jobid, pgroup.user,
                            pgroup.kernel)
                    script_pgroups.append(pgroup)
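
Judging only from the keys this method reads (mode, location, kernel, walltime), a script-mode spec passed to add_process_groups has roughly the following shape; every value here is illustrative:

    spec = {
        'mode': 'script',                # anything else is routed to other_specs
        'location': ['ANL-R00-M0-512'],  # location[0] is passed to _set_kernel()
        'kernel': 'default',             # non-default kernels are logged per job
        'walltime': '30',                # minutes; reservation is 60 * float(walltime)
        # ... plus whatever fields process_groups.q_add() expects (jobid, user, ...)
    }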
Example #3
    def prefork(self):
        ret = {}
        
        # check for valid user/group
        try:
            userid, groupid = pwd.getpwnam(self.user)[2:4]
        except KeyError:
            raise ProcessGroupCreationError("error getting uid/gid")

        ret["userid"] = userid
        ret["primary_group"] = groupid
        
        self.nodefile = "/var/tmp/cobalt.%s" % self.jobid
        
        # get supplementary groups
        supplementary_group_ids = []
        for g in grp.getgrall():
            if self.user in g.gr_mem:
                supplementary_group_ids.append(g.gr_gid)
        
        ret["other_groups"] = supplementary_group_ids
        
        ret["umask"] = self.umask
        
        try:
            rank0 = self.location[0].split(":")[0]
        except IndexError:
            raise ProcessGroupCreationError("no location")

        kerneloptions = self.kerneloptions

        ret["postfork_env"] = self.env
        ret["stdin"] = self.stdin
        ret["stdout"] = self.stdout
        ret["stderr"] = self.stderr
        
        cmd_string = "/usr/bin/cobalt-launcher.py --nf %s --jobid %s --cwd %s --exe %s" % (self.nodefile, self.jobid, self.cwd, self.executable)
        cmd = ("/usr/bin/ssh", "/usr/bin/ssh", rank0, cmd_string)

        
        ret["id"] = self.id
        ret["jobid"] = self.jobid
        ret["cobalt_log_file"] = self.cobalt_log_file
        ret["cmd" ] = cmd

        return ret
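
The repeated "/usr/bin/ssh" in cmd appears to follow the exec convention: the first element is the path to execute and the remaining elements become the new process's argv, so argv[0] is the program name again. A sketch of how a forking caller might consume the dictionary returned by prefork() (the caller is not shown in this example and is assumed here):

    import os

    def launch(prefork_data):
        # hypothetical consumer of the prefork() dictionary
        pid = os.fork()
        if pid == 0:
            if prefork_data["umask"] is not None:
                os.umask(prefork_data["umask"])
            os.setgroups(prefork_data["other_groups"])
            os.setgid(prefork_data["primary_group"])
            os.setuid(prefork_data["userid"])
            cmd = prefork_data["cmd"]
            os.execl(*cmd)   # cmd[0] is the path, cmd[1:] becomes argv
        return pid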
Example #4
    def prefork(self):
        ret = ProcessGroup.prefork(self)

        sim_mode  = get_orcm_system_config("simulation_mode", 'false').lower() in config_true_values
        if not sim_mode:
            nodefile_dir = get_orcm_system_config("nodefile_dir", "/var/tmp")
            self.nodefile = os.path.join(nodefile_dir, "cobalt.%s" % self.jobid)
        else:
            self.nodefile = "fake"

        try:
            #This is the head node, return this to the user.
            rank0 = self.location[0].split(":")[0]
        except IndexError:
            raise ProcessGroupCreationError("no location")

        split_args = self.args
        cmd_args = ['--nf', str(self.nodefile),
                    '--jobid', str(self.jobid),
                    '--cwd', str(self.cwd),]

        qsub_env_list = ["%s=%s" % (key, val) for key, val in self.env.iteritems()]
        for env in qsub_env_list:
            cmd_args.extend(['--env', env])
        cmd_args.append(self.executable)

        cmd_exe = None
        if sim_mode:
            logger.debug("We are setting up with simulation mode.")
            cmd_exe = get_cluster_system_config("simulation_executable", None)
            if cmd_exe is None:
                logger.critical("Job: %s/%s: Executable for simulator not specified! This job will not run!",
                                self.jobid, self.user)
                raise RuntimeError("Unspecified simulation_executable in cobalt config")
        else:
            cmd_exe = get_orcm_system_config('launcher','/usr/bin/cobalt-launcher.py')

        # launch the user script on the compute node, driven from this (login) node via ssh
        if (get_orcm_system_config("run_remote", 'true').lower() in config_true_values and not sim_mode):
            cmd = ("/usr/bin/ssh", rank0, cmd_exe, ) + tuple(cmd_args) + tuple(split_args)
        else:
            cmd = (cmd_exe,) + tuple(cmd_args) + tuple(split_args)

        ret["cmd" ] = cmd
        ret["args"] = cmd[1:]
        ret["executable"] = cmd[0]
        self.executable = ret["executable"]
        self.cmd = ret["cmd"]
        self.args = list(ret["args"])

        return ret
Example #5
    def _get_argv(self, config_files=None):
        """Get a command string for a process group for a process group."""
        if config_files is None:
            config_files = Cobalt.CONFIG_FILES
        config = ConfigParser()
        config.read(config_files)

        argv = [
            config.get("bgpm", "mpirun"),
            os.path.basename(config.get("bgpm", "mpirun")),
        ]

        if self.true_mpi_args is not None:
            # arguments have been passed along in a special attribute.  These arguments have
            # already been modified to include the partition that cobalt has selected
            # for the process group.
            argv.extend(self.true_mpi_args)
            return argv

        argv.extend([
            "-np",
            str(self.size),
            "-mode",
            self.mode,
            "-cwd",
            self.cwd,
            "-exe",
            self.executable,
        ])

        try:
            partition = self.location[0]
        except (KeyError, IndexError):
            raise ProcessGroupCreationError("location")
        argv.extend(["-partition", partition])

        if self.kerneloptions:
            argv.extend(['-kernel_options', self.kerneloptions])

        if self.args:
            argv.extend(["-args", " ".join(self.args)])

        if self.env:
            env_kvstring = " ".join([
                "%s=%s" % (key, value) for key, value in self.env.iteritems()
            ])
            argv.extend(["-env", env_kvstring])

        return argv
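
_get_argv reads the mpirun path from a [bgpm] section of the Cobalt config files. A stanza like the following (the config path and mpirun location are illustrative) would satisfy the two config.get calls, producing an argv whose first element is the full path and whose second element is the basename used as the launched program's argv[0]:

    # /etc/cobalt.conf (illustrative):
    #
    #     [bgpm]
    #     mpirun = /bgsys/drivers/ppcfloor/bin/mpirun
    #
    import os
    from ConfigParser import ConfigParser   # Python 2, matching the example

    config = ConfigParser()
    config.read(["/etc/cobalt.conf"])        # illustrative path
    mpirun = config.get("bgpm", "mpirun")
    argv = [mpirun, os.path.basename(mpirun)]   # ["/bgsys/.../mpirun", "mpirun"]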
Example #6
    def prefork(self):
        ret = {}

        sim_mode = get_cluster_system_config(
            "simulation_mode", 'false').lower() in config_true_values
        if not sim_mode:
            nodefile_dir = get_cluster_system_config("nodefile_dir",
                                                     "/var/tmp")
            self.nodefile = os.path.join(nodefile_dir,
                                         "cobalt.%s" % self.jobid)
        else:
            self.nodefile = "fake"

        try:
            rank0 = self.location[0].split(":")[0]
        except IndexError:
            raise ProcessGroupCreationError("no location")

        split_args = self.args.split()
        cmd_args = ('--nf', str(self.nodefile), '--jobid', str(self.jobid),
                    '--cwd', str(self.cwd), '--exe', str(self.executable))

        cmd_exe = None
        if sim_mode:
            cmd_exe = get_cluster_system_config("simulation_executable", None)
            if cmd_exe is None:
                logger.critical(
                    "Job: %s/%s: Executable for simulator not specified! "
                    "This job will not run!", self.jobid, self.user)
                raise RuntimeError(
                    "Unspecified simulation_executable in cobalt config")
        else:
            #FIXME: Need to put launcher location into config
            cmd_exe = '/usr/bin/cobalt-launcher.py'

        # launch the user script on the compute node, driven from this (login) node via ssh
        if (get_cluster_system_config("run_remote", 'true').lower()
                in config_true_values and not sim_mode):
            cmd = ("/usr/bin/ssh", rank0,
                   cmd_exe) + cmd_args + tuple(split_args)
        else:
            cmd = (cmd_exe, ) + cmd_args + tuple(split_args)

        ret["cmd"] = cmd
        ret["args"] = split_args

        return ret
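
With run_remote enabled and simulation mode off, the tuple assembled above becomes an ssh invocation against the job's head node. For a hypothetical job the result would look roughly like this (every value is made up):

    # rank0 = "cc123", jobid = 42, cwd = "/home/alice",
    # executable = "/home/alice/a.out", args = "-n 8" (a string, hence .split())
    cmd = ("/usr/bin/ssh", "cc123", "/usr/bin/cobalt-launcher.py",
           "--nf", "/var/tmp/cobalt.42", "--jobid", "42",
           "--cwd", "/home/alice", "--exe", "/home/alice/a.out",
           "-n", "8")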
Example #7
    def _get_argv(self, config_files=None):
        """Get a command string for a process group for a process group."""
        if config_files is None:
            config_files = Cobalt.CONFIG_FILES
        config = ConfigParser()
        config.read(config_files)

        argv = [
            config.get("bgpm", "mpirun"),
            os.path.basename(config.get("bgpm", "mpirun")),
        ]

        argv.extend([
            "-np",
            str(self.size),
            "-mode",
            self.mode,
            "-cwd",
            self.cwd,
            "-exe",
            self.executable,
        ])

        try:
            partition = self.location[0]
        except (KeyError, IndexError):
            raise ProcessGroupCreationError("location")
        argv.extend(["-partition", partition])

        if self.kerneloptions:
            argv.extend(['-kernel_options', self.kerneloptions])

        if self.args:
            argv.extend(["-args", " ".join(self.args)])

        if self.env:
            env_kvstring = " ".join([
                "%s=%s" % (key, value) for key, value in self.env.iteritems()
            ])
            argv.extend(["-env", env_kvstring])

        return argv
Example #8
    def add_process_groups(self, specs):
        """Create a simulated process group.
        
        Arguments:
        spec -- dictionary hash specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode') == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        new_pgroups = []
        if script_specs:
            try:
                for spec in script_specs:
                    script_pgroup = ComponentProxy("script-manager").add_jobs(
                        [spec])
                    new_pgroup = self.process_groups.q_add([spec])
                    new_pgroup[0].script_id = script_pgroup[0]['id']
                    self.reserve_resources_until(
                        spec['location'],
                        time.time() + 60 * float(spec['walltime']),
                        new_pgroup[0].jobid)
                    new_pgroups.append(new_pgroup[0])
            except (ComponentLookupError, xmlrpclib.Fault):
                raise ProcessGroupCreationError(
                    "system::add_process_groups failed to communicate with script-manager"
                )

        process_groups = self.process_groups.q_add(other_specs)
        for process_group in process_groups:
            self.start(process_group)

        return new_pgroups + process_groups
Example #9
    def prefork(self):
        ret = {}

        # check for valid user/group
        try:
            userid, groupid = pwd.getpwnam(self.user)[2:4]
        except KeyError:
            raise ProcessGroupCreationError("error getting uid/gid")

        ret["userid"] = userid
        ret["primary_group"] = groupid

        # get supplementary groups
        supplementary_group_ids = []
        for g in grp.getgrall():
            if self.user in g.gr_mem:
                supplementary_group_ids.append(g.gr_gid)

        ret["other_groups"] = supplementary_group_ids

        ret["umask"] = self.umask

        try:
            partition = self.location[0]
        except IndexError:
            raise ProcessGroupCreationError("no location")

        kerneloptions = self.kerneloptions

        # export subset of MPIRUN_* variables to mpirun's environment
        # we explicitly state the ones we want since some are "dangerous"
        exportenv = [
            'MPIRUN_CONNECTION', 'MPIRUN_KERNEL_OPTIONS', 'MPIRUN_MAPFILE',
            'MPIRUN_START_GDBSERVER', 'MPIRUN_LABEL', 'MPIRUN_NW',
            'MPIRUN_VERBOSE', 'MPIRUN_ENABLE_TTY_REPORTING', 'MPIRUN_STRACE'
        ]
        postfork_env = {}
        app_envs = []
        for key, value in self.env.iteritems():
            if key in exportenv:
                postfork_env[key] = value
            else:
                app_envs.append((key, value))

        envs = " ".join(["%s=%s" % x for x in app_envs])

        ret["postfork_env"] = postfork_env
        ret["stdin"] = self.stdin
        ret["stdout"] = self.stdout
        ret["stderr"] = self.stderr

        cmd = (self.config['mpirun'], os.path.basename(self.config['mpirun']),
               '-host', self.config['mmcs_server_ip'], '-np', str(self.size),
               '-partition', partition, '-mode', self.mode, '-cwd', self.cwd,
               '-exe', self.executable)
        if self.args:
            cmd = cmd + ('-args', self.args)
        if envs:
            cmd = cmd + ('-env', envs)
        if kerneloptions:
            cmd = cmd + ('-kernel_options', kerneloptions)

        # If this mpirun command originated from a user script, its arguments
        # have been passed along in a special attribute.  These arguments have
        # already been modified to include the partition that cobalt has selected
        # for the job, and can just replace the arguments built above.
        if self.true_mpi_args:
            cmd = (self.config['mpirun'],
                   os.path.basename(self.config['mpirun'])) + tuple(
                       self.true_mpi_args)

        ret["id"] = self.id
        ret["jobid"] = self.jobid
        ret["cobalt_log_file"] = self.cobalt_log_file
        ret["cmd"] = cmd

        return ret
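
The environment split near the top of this prefork() puts only the whitelisted MPIRUN_* variables into the forked process's own environment (postfork_env); everything else is flattened into the single string passed to mpirun's -env flag. A small illustration with made-up values:

    env = {"MPIRUN_VERBOSE": "1", "OMP_NUM_THREADS": "4", "PATH": "/usr/bin"}
    # after the loop above:
    #   postfork_env == {"MPIRUN_VERBOSE": "1"}
    #   envs == "OMP_NUM_THREADS=4 PATH=/usr/bin"   (order depends on dict iteration)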