Example #1
0
    def run(self):
        """Attach to the ORTE DVM, submit three tasks and wait for them.

        Every accepted task is recorded in the module-level
        ``task_instance_map`` under the index ORTE reports for it, and the
        ``self.myspawn`` / ``self.mywait`` counters are incremented once per
        task.  The method then polls until both counters are back at zero.
        Nothing in this method decrements them -- presumably the launch and
        finish callbacks do (confirm against their definitions).

        Raises:
            RuntimeError: if ``orte_submit_job`` returns a non-zero code.
        """

        # A cffi "char *[]" only stores raw pointers; the "char[]" buffers
        # must be referenced from Python for as long as `argv` is in use.
        argv_keepalive = [
            ffi.new("char[]", "submit"),  # Will be stripped off by the library
            ffi.new("char[]", "--hnp"),
            ffi.new("char[]", DVM_URI),
            ffi.NULL,  # Required
        ]
        argv = ffi.new("char *[]", argv_keepalive)
        lib.orte_submit_init(3, argv, ffi.NULL)

        # Out-parameter that receives the index of each submitted job.
        index = ffi.new("int *")

        for _ in range(3):

            argv_keepalive = [
                ffi.new("char[]", "RADICAL-Pilot"),
                ffi.new("char[]", "--np"),
                ffi.new("char[]", "1"),
                ffi.new("char[]", "false"),
                ffi.NULL,  # Required
            ]
            argv = ffi.new("char *[]", argv_keepalive)
            rc = lib.orte_submit_job(argv, index, lib.launch_cb, ffi.NULL,
                                     lib.finish_cb, ffi.NULL)
            if rc:
                # Never register a job that was not accepted: `index` would
                # be stale and the counters below would never drain, so the
                # wait loop would spin forever.
                raise RuntimeError("submit job failed with error: %d" % rc)

            task = index[0]
            task_instance_map[task] = self
            self.mywait += 1
            self.myspawn += 1
            print("Task %d submitted!" % task)

        # Poll until every task has been accounted for.
        while self.myspawn > 0 or self.mywait > 0:
            time.sleep(0.1)

        print("Done!")
Example #2
0
    def run(self):
        """Connect to the ORTE DVM, submit three tasks, and block until done.

        For each task the index written by ``orte_submit_job`` is used as the
        key under which ``self`` is stored in the module-level
        ``task_instance_map``; ``self.mywait`` and ``self.myspawn`` are each
        incremented once per task.  The method returns once both counters
        have dropped back to zero (they are not decremented here; presumably
        the registered callbacks take care of that -- verify).

        Raises:
            RuntimeError: if ``orte_submit_job`` reports a non-zero code.
        """

        # The list keeps the individual "char[]" buffers referenced; the
        # "char *[]" built from it holds bare pointers only.
        argv_keepalive = [
            ffi.new("char[]", "submit"),  # Will be stripped off by the library
            ffi.new("char[]", "--hnp"),
            ffi.new("char[]", DVM_URI),
            ffi.NULL,  # Required
        ]
        argv = ffi.new("char *[]", argv_keepalive)
        lib.orte_submit_init(3, argv, ffi.NULL)

        # Receives the job index assigned by ORTE on every submission.
        index = ffi.new("int *")

        for _ in range(3):

            argv_keepalive = [
                ffi.new("char[]", "RADICAL-Pilot"),
                ffi.new("char[]", "--np"),
                ffi.new("char[]", "1"),
                ffi.new("char[]", "false"),
                ffi.NULL,  # Required
            ]
            argv = ffi.new("char *[]", argv_keepalive)
            rc = lib.orte_submit_job(argv, index, lib.launch_cb, ffi.NULL,
                                     lib.finish_cb, ffi.NULL)
            if rc:
                # A rejected job must not be tracked: its index is not valid
                # and counting it would keep the wait loop below from ever
                # finishing.
                raise RuntimeError("submit job failed with error: %d" % rc)

            task = index[0]
            task_instance_map[task] = self
            self.mywait += 1
            self.myspawn += 1
            print("Task %d submitted!" % task)

        while self.myspawn > 0 or self.mywait > 0:
            time.sleep(0.1)

        print("Done!")
Example #3
0
    def run(self):
        """Submit three ``sleep`` tasks to the ORTE DVM and cancel one of them.

        After submission the method polls ``self.myspawn`` / ``self.mywait``
        until both reach zero.  While polling, it watches the module-level
        flag ``fourislaunched``; when the flag is set it calls
        ``orte_submit_cancel(1)`` (hard-coded task index) and clears the
        flag.  The flag is not set in this method -- presumably a callback
        sets it (verify).  Finally ``orte_submit_halt`` is called.

        Raises:
            RuntimeError: if ``orte_submit_job`` returns a non-zero code.
        """

        # Declared once for the whole function instead of inside the loop.
        global fourislaunched

        # The list keeps the "char[]" buffers referenced while the
        # "char *[]" built from it (which stores bare pointers) is in use.
        argv_keepalive = [
            ffi.new("char[]", "RADICAL-Pilot"),  # Will be stripped off by the library
            ffi.new("char[]", "--hnp"),
            ffi.new("char[]", DVM_URI),
            ffi.NULL,  # Required
        ]
        argv = ffi.new("char *[]", argv_keepalive)
        lib.orte_submit_init(3, argv, ffi.NULL)

        # Single-element int array used as the job-index out-parameter.
        index = ffi.new("int[1]")

        for _ in range(3):

            argv_keepalive = [
                ffi.new("char[]", "RADICAL-Pilot"),
                ffi.new("char[]", "--np"),
                ffi.new("char[]", "1"),
                ffi.new("char[]", "bash"),
                ffi.new("char[]", "-c"),
                #ffi.new("char[]", "t=%d; echo $t; touch TOUCHME; sleep $t; exit 0" % 10),
                ffi.new("char[]", "sleep %d" % 10),
                ffi.NULL,  # Required
            ]
            argv = ffi.new("char *[]", argv_keepalive)
            rc = lib.orte_submit_job(argv, index, lib.launch_cb, ffi.NULL,
                                     lib.finish_cb, ffi.NULL)
            if rc:
                # Do not track a job that ORTE refused: the index would be
                # stale and the counters would never return to zero.
                raise RuntimeError("submit job failed with error: %d" % rc)

            task = index[0]
            task_instance_map[task] = self
            self.mywait += 1
            self.myspawn += 1
            print("Task %d submitted!" % task)

        while self.myspawn > 0 or self.mywait > 0:

            if fourislaunched:
                print("Cancelling task ...")
                lib.orte_submit_cancel(1)
                # Reset so the cancel request is issued only once.
                fourislaunched = False

            time.sleep(0.1)

        print("Done!")

        lib.orte_submit_halt()
Example #4
0
    def spawn(self, launcher, cu):
        """Build the ORTE argument vector for a compute unit and submit it.

        Args:
            launcher: object whose ``construct_command(cu, None)`` returns the
                pair ``(orte_command, task_command)``.
            cu: compute unit dict.  This method reads ``cu['uid']`` and
                ``cu['description']['environment']`` and writes
                ``cu['stdout_file']`` and ``cu['stderr_file']``.

        Raises:
            RuntimeError: if ``launcher.construct_command`` fails.
            Exception: if ``orte_submit_job`` returns a non-zero code.
        """

        # NOTE: see documentation of cu['sandbox'] semantics in the ComputeUnit
        #       class definition.
        sandbox = '%s/%s' % (self._pwd, cu['uid'])

        # The unit runs directly in its sandbox.  (A separate directory under
        # self.tmpdir used to be selectable here but was hard-wired off.)
        cu_tmpdir = sandbox

        rec_makedir(cu_tmpdir)

        # TODO: pre_exec
        # # Before the Big Bang there was nothing
        # if cu['description']['pre_exec']:
        #     fail = ' (echo "pre_exec failed"; false) || exit'
        #     pre  = ''
        #     for elem in cu['description']['pre_exec']:
        #         pre += "%s || %s\n" % (elem, fail)
        #     # Note: extra spaces below are for visual alignment
        #     launch_script.write("# Pre-exec commands\n")
        #     if 'RADICAL_PILOT_PROFILE' in os.environ:
        #         launch_script.write("echo cu_pre_start `%s` >> %s/%s.prof\n"\
        #                           % (cu['gtod'], cu_tmpdir, cu['uid']))
        #     launch_script.write(pre)
        #     if 'RADICAL_PILOT_PROFILE' in os.environ:
        #         launch_script.write("echo cu_pre_stop `%s` >> %s/%s.prof\n" \
        #                           % (cu['gtod'], cu_tmpdir, cu['uid']))

        # TODO: post_exec
        # # After the universe dies the infrared death, there will be nothing
        # if cu['description']['post_exec']:
        #     fail = ' (echo "post_exec failed"; false) || exit'
        #     post = ''
        #     for elem in cu['description']['post_exec']:
        #         post += "%s || %s\n" % (elem, fail)
        #     launch_script.write("# Post-exec commands\n")
        #     if 'RADICAL_PILOT_PROFILE' in os.environ:
        #         launch_script.write("echo cu_post_start `%s` >> %s/%s.prof\n" \
        #                           % (cu['gtod'], cu_tmpdir, cu['uid']))
        #     launch_script.write('%s\n' % post)
        #     if 'RADICAL_PILOT_PROFILE' in os.environ:
        #         launch_script.write("echo cu_post_stop  `%s` >> %s/%s.prof\n" \
        #                           % (cu['gtod'], cu_tmpdir, cu['uid']))

        # The actual command line, constructed per launch-method
        try:
            orte_command, task_command = launcher.construct_command(cu, None)
        except Exception as e:
            msg = "Error in spawner (%s)" % e
            self._log.exception(msg)
            raise RuntimeError(msg)

        # Arguments to submit_job.  The list also keeps every "char[]" buffer
        # referenced until the job has been submitted; the "char *[]" built
        # from it below stores bare pointers only.
        arg_list = []

        def add_args(*values):
            # One "char[]" buffer per argument.  str() is applied uniformly
            # so that non-str values (e.g. unicode) are converted before
            # being handed to cffi.
            for value in values:
                arg_list.append(ffi.new("char[]", str(value)))

        # Take the orte specific commands and split them
        add_args(*orte_command.split())

        # Set the working directory
        add_args("--wdir", cu_tmpdir)

        # Set RP environment variables
        rp_envs = [
            "RP_SESSION_ID=%s" % self._cfg['session_id'],
            "RP_PILOT_ID=%s"   % self._cfg['pilot_id'],
            "RP_AGENT_ID=%s"   % self._cfg['agent_name'],
            "RP_SPAWNER_ID=%s" % self.uid,
            "RP_UNIT_ID=%s"    % cu['uid'],
            "RP_PILOT_STAGING=%s/staging_area" % self._pwd
        ]
        for env in rp_envs:
            add_args("-x", env)

        # Set pre-populated environment variables
        if self._cu_environment:
            for key, val in self._cu_environment.items():
                add_args("-x", "%s=%s" % (key, val))

        # Set environment variables specified for this CU
        if cu['description']['environment']:
            for key, val in cu['description']['environment'].items():
                add_args("-x", "%s=%s" % (key, val))

        # Let the orted write stdout and stderr to rank-based output files
        add_args("--output-filename", "%s:nojobid,nocopy" % str(cu_tmpdir))

        # Save retval of actual CU application (in case we have post-exec)
        task_command += "; RETVAL=$?"

        # Wrap in (sub)shell for output redirection
        add_args("sh", "-c")
        if 'RADICAL_PILOT_PROFILE' in os.environ:
            prof_file = "%s/%s.prof" % (cu_tmpdir, cu['uid'])

            def stamp(event):
                # Shell snippet appending a timestamped event to the profile.
                return "echo script %s `%s` >> %s" \
                     % (event, self.gtod, prof_file)

            task_command = "; ".join([stamp('cu_start'),
                                      stamp('cu_cd_done'),
                                      stamp('cu_exec_start'),
                                      task_command,
                                      stamp('cu_exec_stop')])
        add_args("%s; exit $RETVAL" % str(task_command))

        self._log.debug("Launching unit %s via %s %s", cu['uid'],
                        orte_command, task_command)

        # NULL termination, required by ORTE
        arg_list.append(ffi.NULL)
        argv = ffi.new("char *[]", arg_list)

        # stdout/stderr filenames can't be set with orte
        # TODO: assert here or earlier?
        # assert cu['description'].get('stdout') == None
        # assert cu['description'].get('stderr') == None

        # prepare stdout/stderr
        # TODO: when mpi==True && cores>1 there will be multiple files that need
        #       to be concatenated.
        cu['stdout_file'] = os.path.join(cu_tmpdir, 'rank.0/stdout')
        cu['stderr_file'] = os.path.join(cu_tmpdir, 'rank.0/stderr')

        # Submit to the DVM!
        index = ffi.new("int *")
        # Submission and task_map registration happen under the same lock
        # (presumably so that callbacks never see an unregistered index --
        # confirm against the callback code).
        with self.task_map_lock:

            self._prof.prof('exec_start', uid=cu['uid'])
            rc = orte_lib.orte_submit_job(argv, index, orte_lib.launch_cb,
                                          self._myhandle, orte_lib.finish_cb,
                                          self._myhandle)
            if rc:
                raise Exception("submit job failed with error: %d" % rc)

            self.task_map[index[0]] = cu      # map ORTE index to CU
            self._prof.prof('exec_ok', uid=cu['uid'])

        # The uid is a string, so it has to be formatted with %s, not %d.
        self._log.debug("Task %s submitted!", cu['uid'])
Example #5
0
    def run(self):
        """Run TASKS sleep tasks through the ORTE DVM, at most CORES at once.

        Tasks are numbered 1..TASKS and named ``unit.NNNNNN``.  For every
        task the method emits the profiler events of the agent staging-input,
        scheduling and executing components, creates a directory named after
        the task, and submits a shell command that sleeps for SLEEP seconds
        and appends timestamps to ``<task dir>/PROF``.  ``self.active`` is
        incremented per submitted task; it is not decremented here
        (presumably the finish callback does that -- verify).  Once all tasks
        are done, every task's PROF file is fed back into the session
        profiler.

        Raises:
            RuntimeError: if ``orte_submit_job`` returns a non-zero code.
        """

        prof = self.session.prof.prof

        asic = 'AgentStagingInputComponent'
        asc = 'AgentSchedulingComponent'
        aec = 'AgentExecutingComponent'
        asoc = 'AgentStagingOutputComponent'

        # The list keeps the "char[]" buffers referenced while the
        # "char *[]" built from it (bare pointers only) is in use.
        argv_keepalive = [
            ffi.new("char[]", "RADICAL-Pilot"),  # Will be stripped off by the library
            ffi.new("char[]", "--hnp"),
            ffi.new("char[]", DVM_URI),
            ffi.NULL,  # Required
        ]
        argv = ffi.new("char *[]", argv_keepalive)
        lib.orte_submit_init(3, argv, ffi.NULL)

        # Used for storing the task id that is returned by orte_submit_job
        index_ptr = ffi.new("int[1]")

        task_no = 1
        while task_no <= TASKS or self.active > 0:

            if task_no > TASKS or self.active >= CORES:
                # Nothing to submit right now: either all tasks are out or
                # all cores are busy.
                time.sleep(0.001)
                continue

            task_id = 'unit.%.6d' % task_no
            cu_tmpdir = '%s' % task_id

            #
            # ASIC
            #
            prof(event='get', state=AGENT_STAGING_INPUT_PENDING,
                 uid=task_id, name=asic)
            prof(event='work start', state=AGENT_STAGING_INPUT_PENDING,
                 uid=task_id, name=asic)
            prof('advance', uid=task_id, state=AGENT_STAGING_INPUT, name=asic)
            os.mkdir('%s' % cu_tmpdir)
            prof('advance', uid=task_id, state=ALLOCATING_PENDING, name=asic)
            prof(event='work done', state=AGENT_STAGING_INPUT_PENDING,
                 uid=task_id, name=asic)
            prof(event='put', state=ALLOCATING_PENDING,
                 uid=task_id, name=asic)

            #
            # ASC
            #
            prof(event='get', state=ALLOCATING_PENDING,
                 uid=task_id, name=asc)
            prof(event='work start', state=ALLOCATING_PENDING,
                 uid=task_id, name=asc)
            prof('advance', uid=task_id, state=ALLOCATING, name=asc)
            prof('schedule', msg='try', uid=task_id, name=asc)
            prof('schedule', msg='allocated', uid=task_id, name=asc)
            prof('advance', uid=task_id, state=EXECUTING_PENDING, name=asc)
            prof(event='put', state=EXECUTING_PENDING,
                 uid=task_id, name=asc)
            prof(event='work done', state=ALLOCATING_PENDING,
                 uid=task_id, name=asc)

            #
            # AEC
            #
            prof(event='get', state=EXECUTING_PENDING,
                 uid=task_id, name=aec)
            prof(event='work start', state=EXECUTING_PENDING,
                 uid=task_id, name=aec)
            prof('exec', msg='unit launch', uid=task_id, name=aec)
            prof('spawn', msg='unit spawn', uid=task_id, name=aec)

            argv_keepalive = [
                ffi.new("char[]", "RADICAL-Pilot"),
                ffi.new("char[]", "--np"),
                ffi.new("char[]", "1"),
                # Let the orted write stdout and stderr to rank-based
                # output files
                ffi.new("char[]", "--output-filename"),
                ffi.new("char[]", "%s:nojobid,nocopy" % str(cu_tmpdir)),
                # Wrap in (sub)shell for output redirection
                ffi.new("char[]", "sh"),
                ffi.new("char[]", "-c"),
            ]

            # RETVAL is captured right after the task command so that the
            # trailing `exit $RETVAL` reports the task's own exit status and
            # not that of the last profiling echo.
            prof_file = "%s/PROF" % cu_tmpdir
            task_command = "; ".join([
                "echo script start_script `%s` >> %s" % (GTOD, prof_file),
                'sleep %d' % SLEEP,
                "RETVAL=$?",
                "echo script after_exec `%s` >> %s" % (GTOD, prof_file),
                "exit $RETVAL",
            ])
            argv_keepalive.append(ffi.new("char[]", str(task_command)))

            argv_keepalive.append(ffi.NULL)  # NULL Termination Required
            argv = ffi.new("char *[]", argv_keepalive)

            prof('command', msg='launch command constructed',
                 uid=task_id, name=aec)

            # The handle is passed to both callbacks; it is stored in
            # self.task_instance_map below, which keeps it referenced.
            struct = {'instance': self, 'task': task_id}
            cbdata = ffi.new_handle(struct)

            rc = lib.orte_submit_job(argv, index_ptr, lib.launch_cb, cbdata,
                                     lib.finish_cb, cbdata)
            if rc:
                # A rejected job would otherwise be counted in self.active
                # forever and the loop would never terminate.
                raise RuntimeError("submit job failed with error: %d" % rc)

            index = index_ptr[0]  # pointer notation
            self.task_instance_map[index] = cbdata

            prof('spawn', msg='spawning passed to orte',
                 uid=task_id, name=aec)
            prof(event='work done', state=EXECUTING_PENDING,
                 uid=task_id, name=aec)

            print("Task %s submitted!" % task_id)

            self.active += 1
            task_no += 1

        print("Execution done.")
        # An argument is required: under the Python 2 print statement a bare
        # `print()` would output "()" instead of an empty line.
        print("")
        print("Collecting profiles ...")
        # Tasks were numbered 1..TASKS above, so collect the same range.
        for task_no in range(1, TASKS + 1):
            task_id = 'unit.%.6d' % task_no
            prof('advance', uid=task_id, state=AGENT_STAGING_OUTPUT,
                 name=asoc)
            prof_path = "%s/PROF" % task_id
            if not os.path.isfile(prof_path):
                continue
            # Best effort: a broken profile file is reported, not fatal.
            try:
                with open(prof_path, 'r') as prof_f:
                    for line in prof_f:
                        fields = line.split()
                        if not fields:
                            continue
                        x1, x2, x3 = fields
                        prof(x1, msg=x2, timestamp=float(x3),
                             uid=task_id, name=asoc)
            except Exception as e:
                print("Pre/Post profiling file read failed: `%s`" % e)
Example #6
0
    def spawn(self, launcher, cu):
        """Assemble the ORTE argument vector for a unit and submit it.

        Args:
            launcher: object whose ``construct_command(cu, None)`` returns the
                pair ``(orte_command, task_command)``.
            cu: compute unit dict.  This method reads ``cu['uid']``,
                ``cu['unit_sandbox_path']`` and ``cu['description']`` (keys
                ``name`` and ``environment``), and writes
                ``cu['stdout_file']`` and ``cu['stderr_file']``.

        Raises:
            RuntimeError: if ``launcher.construct_command`` fails.
            Exception: if ``orte_submit_job`` returns a non-zero code.
        """

        # The unit runs directly in its sandbox.  (A separate directory under
        # self.tmpdir used to be selectable here but was hard-wired off.)
        sandbox = cu['unit_sandbox_path']

        rec_makedir(sandbox)

        # TODO: pre_exec
        # # Before the Big Bang there was nothing
        # if cu['description']['pre_exec']:
        #     fail = ' (echo "pre_exec failed"; false) || exit'
        #     pre  = ''
        #     for elem in cu['description']['pre_exec']:
        #         pre += "%s || %s\n" % (elem, fail)
        #     # Note: extra spaces below are for visual alignment
        #     launch_script.write("# Pre-exec commands\n")
        #     if self._prof.enabled:
        #         launch_script.write("echo cu_pre_start `%s` >> %s/%s.prof\n"\
        #                           % (cu['gtod'], sandbox, cu['uid']))
        #     launch_script.write(pre)
        #     if self._prof.enabled:
        #         launch_script.write("echo cu_pre_stop `%s` >> %s/%s.prof\n" \
        #                           % (cu['gtod'], sandbox, cu['uid']))

        # TODO: post_exec
        # # After the universe dies the infrared death, there will be nothing
        # if cu['description']['post_exec']:
        #     fail = ' (echo "post_exec failed"; false) || exit'
        #     post = ''
        #     for elem in cu['description']['post_exec']:
        #         post += "%s || %s\n" % (elem, fail)
        #     launch_script.write("# Post-exec commands\n")
        #     if self._prof.enabled:
        #         launch_script.write("echo cu_post_start `%s` >> %s/%s.prof\n"
        #                           % (cu['gtod'], sandbox, cu['uid']))
        #     launch_script.write('%s\n' % post)
        #     if self._prof.enabled:
        #         launch_script.write("echo cu_post_stop  `%s` >> %s/%s.prof\n"
        #                           % (cu['gtod'], sandbox, cu['uid']))

        # The actual command line, constructed per launch-method
        try:
            orte_command, task_command = launcher.construct_command(cu, None)
        except Exception as e:
            msg = "Error in spawner (%s)" % e
            self._log.exception(msg)
            raise RuntimeError(msg)

        # Arguments to submit_job.  The list doubles as the keep-alive store
        # for the "char[]" buffers: the "char *[]" built from it further
        # down stores bare pointers only.
        arg_list = []

        def add_args(*values):
            # One "char[]" buffer per argument.  str() is applied uniformly
            # so that non-str values (e.g. unicode) are converted before
            # being handed to cffi.
            for value in values:
                arg_list.append(ffi.new("char[]", str(value)))

        # Take the orte specific commands and split them
        add_args(*orte_command.split())

        # Set the working directory
        add_args("--wdir", sandbox)

        # Set RP environment variables
        rp_envs = [
            "RP_SESSION_ID=%s" % self._cfg['sid'],
            "RP_PILOT_ID=%s"   % self._cfg['pid'],
            "RP_AGENT_ID=%s"   % self._cfg['aid'],
            "RP_SPAWNER_ID=%s" % self.uid,
            "RP_UNIT_ID=%s"    % cu['uid'],
            "RP_UNIT_NAME=%s"  % cu['description'].get('name'),
            "RP_PILOT_STAGING=%s/staging_area" % self._pwd
        ]
        for env in rp_envs:
            add_args("-x", env)

        # Set pre-populated environment variables
        if self._cu_environment:
            for key, val in self._cu_environment.items():
                add_args("-x", "%s=%s" % (key, val))

        # Set environment variables specified for this CU
        if cu['description']['environment']:
            for key, val in cu['description']['environment'].items():
                add_args("-x", "%s=%s" % (key, val))

        # Let the orted write stdout and stderr to rank-based output files
        add_args("--output-filename", "%s:nojobid,nocopy" % str(sandbox))

        # Save retval of actual CU application (in case we have post-exec)
        task_command += "; RETVAL=$?"

        # Wrap in (sub)shell for output redirection
        add_args("sh", "-c")
        if self._prof.enabled:
            prof_file = "%s/%s.prof" % (sandbox, cu['uid'])

            def stamp(event):
                # Shell snippet appending a timestamped event to the profile.
                return "echo script %s `%s` >> %s" \
                     % (event, self.gtod, prof_file)

            task_command = "; ".join([stamp('cu_start'),
                                      stamp('cu_exec_start'),
                                      task_command,
                                      stamp('cu_exec_stop')])
        add_args("%s; exit $RETVAL" % str(task_command))

        self._log.debug("Launching unit %s via %s %s", cu['uid'],
                        orte_command, task_command)

        # NULL termination, required by ORTE
        arg_list.append(ffi.NULL)
        argv = ffi.new("char *[]", arg_list)

        # stdout/stderr filenames can't be set with orte
        # TODO: assert here or earlier?
        # assert cu['description'].get('stdout') == None
        # assert cu['description'].get('stderr') == None

        # prepare stdout/stderr
        # TODO: when mpi==True && cores>1 there will be multiple files that need
        #       to be concatenated.
        cu['stdout_file'] = os.path.join(sandbox, 'rank.0/stdout')
        cu['stderr_file'] = os.path.join(sandbox, 'rank.0/stderr')

        # Submit to the DVM!
        index = ffi.new("int *")
        # Submission and task_map registration happen under the same lock
        # (presumably so that callbacks never see an unregistered index --
        # confirm against the callback code).
        with self.task_map_lock:

            self._prof.prof('exec_start', uid=cu['uid'])
            rc = orte_lib.orte_submit_job(argv, index, orte_lib.launch_cb,
                                          self._myhandle, orte_lib.finish_cb,
                                          self._myhandle)
            if rc:
                raise Exception("submit job failed with error: %d" % rc)

            self.task_map[index[0]] = cu      # map ORTE index to CU
            self._prof.prof('exec_ok', uid=cu['uid'])

        # The uid is a string, so it has to be formatted with %s, not %d.
        self._log.debug("Task %s submitted!", cu['uid'])