def _schedule_jobs(self, jobs, max_per_node = None):
        """
        Schedule a series of compute jobs. Blocks until completion.

        If the ``method`` option in the ``remote`` configuration section is
        one of ``slurm_srun``, ``pbs_ssh``, ``slurm_ssh`` or ``ssh_generic``,
        the ``host`` attribute of every job is overwritten, round robin, with
        the node names obtained for that method before the jobs are
        dispatched.

        After the jobs have run, an XML summary (one ``job`` element per job
        that reported a ``job_duration``) is stored in
        ``self.outputs["return_xml"]``.

        :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
        :param max_per_node: maximum number of simultaneous jobs on any given node
        :type max_per_node: integer or none
        :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
        :raises PipelineQuit: if the node list for the configured method
            cannot be determined or is empty, or if the killswitch was set
            while the jobs were running
        """
        # The jobs are traversed several times below (host assignment,
        # dispatch, reporting); materialise them so that any iterable,
        # including a generator, is handled correctly.
        jobs = list(jobs)
        threadpool = []
        jobpool = {}
        if not max_per_node and self.config.has_option('remote', 'max_per_node'):
            max_per_node = self.config.getint('remote', 'max_per_node')
        limiter = ProcessLimiter(max_per_node)
        killswitch = threading.Event()

        if max_per_node:
            self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

        # External cluster stuff
        method = None
        if self.config.has_option('remote', 'method'):
            method = self.config.get('remote', 'method')

        redistribute_hosts = False
        nodeliststr = []
        # JURECA SLURM
        if method == 'slurm_srun':
            proc = Popen(['srun', 'hostname'], False, stdout=PIPE, stderr=None)
            output = proc.communicate()[0]
            # Popen hands back bytes on Python 3; the string handling below
            # needs text.
            if not isinstance(output, str):
                output = output.decode()
            # remove duplicates. order not important
            nodeliststr = list(set(output.rstrip('\n').split('\n')))
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # Hertfordshire cluster
        elif method == 'pbs_ssh':
            # special case - get the list of nodes from the pbs job
            try:
                filename = os.environ['PBS_NODEFILE']
            except KeyError:
                self.logger.error('PBS_NODEFILE not found.')
                raise PipelineQuit()
            with open(filename, 'r') as nodefile:
                for line in nodefile:
                    fields = line.split()
                    # skip blank lines instead of failing on them
                    if not fields:
                        continue
                    if fields[0] not in nodeliststr:
                        nodeliststr.append(fields[0])
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # get hostlist from slurm, but start jobs via ssh
        elif method == 'slurm_ssh':
            try:
                hostlist = os.environ['SLURM_JOB_NODELIST']
            except KeyError:
                self.logger.error('SLURM_JOB_NODELIST not found. You must have a slurm reservation!')
                raise PipelineQuit()
            nodeliststr = expand_slurm_hostlist(hostlist)
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # generic case, node-names in an env-variable
        elif method == 'ssh_generic':
            if self.config.has_option('remote', 'nodelist_variable'):
                env_name = self.config.get('remote', 'nodelist_variable')
            else:
                env_name = 'PIPELINE_NODES'

            try:
                nodes = os.environ[env_name]
            except KeyError:
                self.logger.error('Env-variable \"'+env_name+'\" not found.')
                raise PipelineQuit()
            # remove duplicates. order not important
            nodeliststr = list(set(
                node.strip() for node in nodes.strip('[] ').split(',')
            ))
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # re-distribute the hosts if requested
        if redistribute_hosts:
            # Empty names (e.g. from empty command output or an empty
            # variable) are not usable hosts.
            nodeliststr = [name for name in nodeliststr if name]
            if jobs and not nodeliststr:
                self.logger.error(
                    'No compute nodes found for method "%s".' % method)
                raise PipelineQuit()
            # when nodes crash? length of slurm_nodelist and env slurm_nnodes dont match anymore
            nnodes = len(nodeliststr)
            # equal distribution, round robin
            for i, job in enumerate(jobs):
                job.host = nodeliststr[i % nnodes]

        with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
            self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
            for job_id, job in enumerate(jobs):
                jobpool[job_id] = job
                threadpool.append(
                    threading.Thread(
                        target = job.dispatch,
                        args = (
                            self.logger, self.config, limiter, job_id,
                            jobhost, jobport, self.error, killswitch
                        )
                    )
                )
            threadwatcher(threadpool, self.logger, killswitch)

        if killswitch.is_set():
            raise PipelineQuit()

        # Add information regarding specific nodes to an xml node.
        self.logger.debug("Adding node_logging_information")
        local_document = xml.Document()
        node_durations = local_document.createElement("nodes")
        for job_id, job in enumerate(jobs):
            # Test if the duration is there
            # fixme the name of node_durations is not logical
            if "job_duration" in job.results:
                child_node_duration = add_child(node_durations, "job")
                child_node_duration.setAttribute("job_id", str(job_id))
                child_node_duration.setAttribute("job_host", str(job.host))
                child_node_duration.setAttribute("duration",
                     str(job.results["job_duration"]))

                # return code if present (Not there on error)
                if "returncode" in job.results:
                    child_node_duration.setAttribute(
                        "returncode", str(job.results['returncode']))
                else:
                    child_node_duration.setAttribute(
                        "returncode", str(-1))

                ## If there is 'node level' resource logging available
                if "monitor_stats" in job.results:
                    return_node = xml.parseString(
                        job.results['monitor_stats']).documentElement

                    child_node_duration.appendChild(return_node)


        # manually add the result xml as an ingredient output.
        # this allows backward compatible logging: If not read an additional
        # output does not matter
        self.outputs._fields["return_xml"] = ingredient.StringField(
                                                help = "XML return data.")
        self.outputs["return_xml"] = node_durations.toxml(encoding = "ascii")

        return jobpool
    def _schedule_jobs(self, jobs, max_per_node = None):
        """
        Schedule a series of compute jobs. Blocks until completion.

        After the jobs have run, an XML summary (one ``job`` element per job
        that reported a ``job_duration``) is stored in
        ``self.outputs["return_xml"]``.

        :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
        :param max_per_node: maximum number of simultaneous jobs on any given node
        :type max_per_node: integer or none
        :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
        :raises PipelineQuit: if the killswitch was set while the jobs were
            running
        """
        # The jobs are traversed twice (dispatch and reporting); materialise
        # them so that a one-shot iterable such as a generator is not
        # silently exhausted before the report is built.
        jobs = list(jobs)
        threadpool = []
        jobpool = {}
        if not max_per_node and self.config.has_option('remote', 'max_per_node'):
            max_per_node = self.config.getint('remote', 'max_per_node')
        limiter = ProcessLimiter(max_per_node)
        killswitch = threading.Event()

        if max_per_node:
            self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

        with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
            self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
            for job_id, job in enumerate(jobs):
                jobpool[job_id] = job
                threadpool.append(
                    threading.Thread(
                        target = job.dispatch,
                        args = (
                            self.logger, self.config, limiter, job_id,
                            jobhost, jobport, self.error, killswitch
                        )
                    )
                )
            threadwatcher(threadpool, self.logger, killswitch)

        if killswitch.is_set():
            raise PipelineQuit()

        # Add information regarding specific nodes to an xml node.
        self.logger.debug("Adding node_logging_information")
        local_document = xml.Document()
        node_durations = local_document.createElement("nodes")
        for job_id, job in enumerate(jobs):
            # Only jobs that reported a duration are included in the report.
            if "job_duration" in job.results:
                child_node_duration = add_child(node_durations, "job")
                child_node_duration.setAttribute("job_id", str(job_id))
                child_node_duration.setAttribute("duration",
                     str(job.results["job_duration"]))
                # return code if present (not there on error)
                if "returncode" in job.results:
                    child_node_duration.setAttribute(
                        "returncode", str(job.results['returncode']))
                else:
                    child_node_duration.setAttribute(
                        "returncode", str(-1))


        # manually add the result xml as an ingredient output.
        # this allows backward compatible logging: If not read an additional
        # output does not matter
        self.outputs._fields["return_xml"] = ingredient.StringField(
                                                help = "XML return data.")
        self.outputs["return_xml"] = node_durations.toxml(encoding = "ascii")

        return jobpool
Exemple #3
0
    def go(self):
        """
        Run BBS on all input data.

        The steps are: cook the ``parmdb`` and ``sourcedb`` recipes, cook
        ``vdsmaker`` to build a GVDS file, and then, for every group yielded
        by ``gvds_iterator``: clear the blackboard session for our key,
        combine the group's VDS files, patch the control parset, and run
        GlobalControl in one thread alongside one kernel thread per
        ``(host, file, vds)`` entry.

        :return: 0 on success; 1 if a sub-recipe reports failure or the
            killswitch was set during a run
        :raises subprocess.CalledProcessError: if ``combinevds`` exits with
            a non-zero return code
        """
        self.logger.info("Starting BBS run")
        super(bbs, self).go()

        #             Generate source and parameter databases for all input data
        # ----------------------------------------------------------------------
        inputs = LOFARinput(self.inputs)
        inputs['args'] = self.inputs['args']
        inputs['executable'] = self.inputs['parmdbm']
        inputs['working_directory'] = self.config.get(
            "DEFAULT", "default_working_directory")
        inputs['mapfile'] = self.task_definitions.get('parmdb', 'mapfile')
        inputs['suffix'] = ".instrument"
        outputs = LOFARoutput(self.inputs)
        if self.cook_recipe('parmdb', inputs, outputs):
            self.logger.warning("parmdb reports failure")
            return 1
        inputs['args'] = self.inputs['args']
        inputs['executable'] = self.inputs['makesourcedb']
        inputs['skymodel'] = self.inputs['skymodel']
        inputs['mapfile'] = self.task_definitions.get('sourcedb', 'mapfile')
        inputs['suffix'] = ".sky"
        outputs = LOFARoutput(self.inputs)
        if self.cook_recipe('sourcedb', inputs, outputs):
            self.logger.warning("sourcedb reports failure")
            return 1

        #              Build a GVDS file describing all the data to be processed
        # ----------------------------------------------------------------------
        self.logger.debug("Building VDS file describing all data for BBS")
        vds_file = os.path.join(self.config.get("layout", "job_directory"),
                                "vds", "bbs.gvds")
        inputs = LOFARinput(self.inputs)
        inputs['args'] = self.inputs['args']
        inputs['gvds'] = vds_file
        inputs['unlink'] = False
        inputs['makevds'] = self.inputs['makevds']
        inputs['combinevds'] = self.inputs['combinevds']
        inputs['nproc'] = self.inputs['nproc']
        inputs['directory'] = os.path.dirname(vds_file)
        outputs = LOFARoutput(self.inputs)
        if self.cook_recipe('vdsmaker', inputs, outputs):
            self.logger.warning("vdsmaker reports failure")
            return 1
        self.logger.debug("BBS GVDS is %s" % (vds_file, ))

        #      Iterate over groups of subbands divided up for convenient cluster
        #         processing -- ie, no more than nproc subbands per compute node
        # ----------------------------------------------------------------------
        for to_process in gvds_iterator(vds_file, int(self.inputs["nproc"])):
            #               to_process is a list of (host, filename, vds) tuples
            # ------------------------------------------------------------------
            hosts, ms_names, vds_files = map(list, zip(*to_process))

            #             The BBS session database should be cleared for our key
            # ------------------------------------------------------------------
            self.logger.debug("Cleaning BBS database for key %s" %
                              (self.inputs["key"]))
            with closing(
                    psycopg2.connect(
                        host=self.inputs["db_host"],
                        user=self.inputs["db_user"],
                        database=self.inputs["db_name"])) as db_connection:
                db_connection.set_isolation_level(
                    psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
                with closing(db_connection.cursor()) as db_cursor:
                    db_cursor.execute(
                        "DELETE FROM blackboard.session WHERE key=%s",
                        (self.inputs["key"], ))

            #     BBS GlobalControl requires a GVDS file describing all the data
            #          to be processed. We assemble that from the separate parts
            #                                         already available on disk.
            # ------------------------------------------------------------------
            self.logger.debug("Building VDS file describing data for BBS run")
            vds_dir = tempfile.mkdtemp(suffix=".%s" %
                                       (os.path.basename(__file__), ))
            vds_file = os.path.join(vds_dir, "bbs.gvds")
            combine_command = [
                self.inputs['combinevds'],
                vds_file,
            ] + vds_files
            combineproc = utilities.spawn_process(combine_command, self.logger)
            sout, serr = combineproc.communicate()
            log_process_output(self.inputs['combinevds'], sout, serr,
                               self.logger)
            if combineproc.returncode != 0:
                # Report the command that actually failed.
                raise subprocess.CalledProcessError(combineproc.returncode,
                                                    combine_command)

            #      Construct a parset for BBS GlobalControl by patching the GVDS
            #           file and database information into the supplied template
            # ------------------------------------------------------------------
            self.logger.debug("Building parset for BBS control")
            bbs_parset = utilities.patch_parset(
                self.inputs['parset'],
                {
                    'Observation': vds_file,
                    'BBDB.Key': self.inputs['key'],
                    'BBDB.Name': self.inputs['db_name'],
                    'BBDB.User': self.inputs['db_user'],
                    'BBDB.Host': self.inputs['db_host'],
                    #                'BBDB.Port': self.inputs['db_name'],
                })
            self.logger.debug("BBS control parset is %s" % (bbs_parset, ))

            try:
                #        When one of our processes fails, we set the killswitch.
                #      Everything else will then come crashing down, rather than
                #                                         hanging about forever.
                # --------------------------------------------------------------
                self.killswitch = threading.Event()
                self.killswitch.clear()
                # Signal handlers are invoked with (signum, frame), which
                # Event.set() does not accept, so wrap the call.
                signal.signal(
                    signal.SIGTERM,
                    lambda signum, frame: self.killswitch.set())

                #                           GlobalControl runs in its own thread
                # --------------------------------------------------------------
                run_flag = threading.Event()
                run_flag.clear()
                bbs_control = threading.Thread(target=self._run_bbs_control,
                                               args=(bbs_parset, run_flag))
                bbs_control.start()
                run_flag.wait()  # Wait for control to start before proceeding

                #      We run BBS KernelControl on each compute node by directly
                #                             invoking the node script using SSH
                #      Note that we use a job_server to send out job details and
                #           collect logging information, so we define a bunch of
                #    ComputeJobs. However, we need more control than the generic
                #     ComputeJob.dispatch method supplies, so we'll control them
                #                                          with our own threads.
                # --------------------------------------------------------------
                command = "python %s" % (self.__file__.replace(
                    'master', 'nodes'))
                env = {
                    "LOFARROOT":
                    utilities.read_initscript(
                        self.logger, self.inputs['initscript'])["LOFARROOT"],
                    "PYTHONPATH":
                    self.config.get('deploy', 'engine_ppath'),
                    "LD_LIBRARY_PATH":
                    self.config.get('deploy', 'engine_lpath')
                }
                jobpool = {}
                bbs_kernels = []
                with job_server(self.logger, jobpool,
                                self.error) as (jobhost, jobport):
                    self.logger.debug("Job server at %s:%d" %
                                      (jobhost, jobport))
                    for job_id, details in enumerate(to_process):
                        host, ms_name, _ = details
                        jobpool[job_id] = ComputeJob(
                            host,
                            command,
                            arguments=[
                                self.inputs['kernel_exec'],
                                self.inputs['initscript'], ms_name,
                                self.inputs['key'], self.inputs['db_name'],
                                self.inputs['db_user'], self.inputs['db_host']
                            ])
                        bbs_kernels.append(
                            threading.Thread(target=self._run_bbs_kernel,
                                             args=(host, command, env, job_id,
                                                   jobhost, str(jobport))))
                    self.logger.info("Starting %d threads" % len(bbs_kernels))
                    for thread in bbs_kernels:
                        thread.start()
                    self.logger.debug("Waiting for all kernels to complete")
                    for thread in bbs_kernels:
                        thread.join()

                #         When GlobalControl finishes, our work here is done
                # ----------------------------------------------------------
                self.logger.info("Waiting for GlobalControl thread")
                bbs_control.join()
            finally:
                os.unlink(bbs_parset)
                shutil.rmtree(vds_dir)
                if self.killswitch.is_set():
                    #  If killswitch is set, then one of our processes failed so
                    #                                   the whole run is invalid
                    # ----------------------------------------------------------
                    # NOTE(review): returning from ``finally`` also discards
                    # any exception raised in the ``try`` body; kept because
                    # callers may rely on the 1 return code.
                    return 1

        return 0
Exemple #4
0
    def go(self):
        """
        Run BBS using the database settings found in the parset.

        The steps are: read the parset and take the ``BBDB.*`` settings from
        it, clear the blackboard session for our key with ``psql``, create
        the BBS map, run the ``vdsmaker`` task, write a patched control
        parset to ``<job_directory>/parsets/bbs_control.parset``, and run
        GlobalControl in one thread alongside one kernel thread per entry of
        ``self.bbs_map``.

        On success ``self.outputs['mapfile']`` is set to the input
        ``data_mapfile``.

        :return: 0 on success; 1 if the BBS map could not be made or the
            killswitch was set during the run
        """
        self.logger.info("Starting BBS run")
        super(new_bbs, self).go()

        #                Check for relevant input parameters in the parset-file
        # ---------------------------------------------------------------------
        self.logger.debug("Reading parset from %s" % self.inputs['parset'])
        self.parset = parameterset(self.inputs['parset'])

        self._set_input('db_host', 'BBDB.Host')
        self._set_input('db_user', 'BBDB.User')
        self._set_input('db_name', 'BBDB.Name')
        self._set_input('db_key', 'BBDB.Key')

        #                                         Clean the blackboard database
        # ---------------------------------------------------------------------
        self.logger.info(
            "Cleaning BBS database for key '%s'" % (self.inputs['db_key'])
        )
        # Double any single quote so that a key containing one cannot
        # terminate the SQL string literal early.
        quoted_key = ("%s" % self.inputs['db_key']).replace("'", "''")
        command = ["psql",
                   "-h", self.inputs['db_host'],
                   "-U", self.inputs['db_user'],
                   "-d", self.inputs['db_name'],
                   "-c", "DELETE FROM blackboard.session WHERE key='%s';" %
                         quoted_key
                  ]
        self.logger.debug(command)
        if subprocess.call(command) != 0:
            self.logger.warning(
                "Failed to clean BBS database for key '%s'" %
                self.inputs['db_key']
            )

        #                  Create a bbs_map describing the file mapping on disk
        # ---------------------------------------------------------------------
        if not self._make_bbs_map():
            return 1

        # Produce a GVDS file, describing the data that must be processed.
        gvds_file = self.run_task(
            "vdsmaker",
            self.inputs['data_mapfile'],
            gvds=self.inputs['gvds']
        )['gvds']

        #      Construct a parset for BBS GlobalControl by patching the GVDS
        #           file and database information into the supplied template
        # ------------------------------------------------------------------
        self.logger.debug("Building parset for BBS control")
        # Create a location for parsets
        job_directory = self.config.get(
                            "layout", "job_directory")
        parset_directory = os.path.join(job_directory, "parsets")
        create_directory(parset_directory)

        # Patch the parset, copy the result to its target location and
        # remove the temporary file again.
        bbs_parset = None
        try:
            bbs_parset = utilities.patch_parset(
                self.parset,
                {
                    'Observation': gvds_file,
                    'BBDB.Key': self.inputs['db_key'],
                    'BBDB.Name': self.inputs['db_name'],
                    'BBDB.User': self.inputs['db_user'],
                    'BBDB.Host': self.inputs['db_host'],
                    #'BBDB.Port': self.inputs['db_name'],
                }
            )
            bbs_parset_path = os.path.join(parset_directory,
                                           "bbs_control.parset")
            shutil.copyfile(bbs_parset, bbs_parset_path)
            self.logger.debug("BBS control parset is %s" % (bbs_parset_path,))

        finally:
            # Always remove the file in the tempdir, provided patching got
            # far enough to create it.
            if bbs_parset is not None:
                os.remove(bbs_parset)

        #        When one of our processes fails, we set the killswitch.
        #      Everything else will then come crashing down, rather than
        #                                         hanging about forever.
        # --------------------------------------------------------------
        self.killswitch = threading.Event()
        self.killswitch.clear()
        # Signal handlers are invoked with (signum, frame), which
        # Event.set() does not accept, so wrap the call.
        signal.signal(
            signal.SIGTERM,
            lambda signum, frame: self.killswitch.set()
        )

        #                           GlobalControl runs in its own thread
        # --------------------------------------------------------------
        # The temporary parset has been removed above; GlobalControl is
        # given the copy stored in the parset directory.
        run_flag = threading.Event()
        run_flag.clear()
        bbs_control = threading.Thread(
            target=self._run_bbs_control,
            args=(bbs_parset_path, run_flag)
        )
        bbs_control.start()
        run_flag.wait()    # Wait for control to start before proceeding

        #      We run BBS KernelControl on each compute node by directly
        #                             invoking the node script using SSH
        #      Note that we use a job_server to send out job details and
        #           collect logging information, so we define a bunch of
        #    ComputeJobs. However, we need more control than the generic
        #     ComputeJob.dispatch method supplies, so we'll control them
        #                                          with our own threads.
        # --------------------------------------------------------------
        command = "python %s" % (self.__file__.replace('master', 'nodes'))
        jobpool = {}
        bbs_kernels = []
        with job_server(self.logger, jobpool, self.error) as (jobhost,
                                                               jobport):
            self.logger.debug("Job server at %s:%d" % (jobhost, jobport))
            for job_id, details in enumerate(self.bbs_map):
                host, files = details
                jobpool[job_id] = ComputeJob(
                    host, command,
                    arguments=[
                        self.inputs['kernel_exec'],
                        files,
                        self.inputs['db_key'],
                        self.inputs['db_name'],
                        self.inputs['db_user'],
                        self.inputs['db_host']
                    ]
                )
                bbs_kernels.append(
                    threading.Thread(
                        target=self._run_bbs_kernel,
                        args=(host, command, job_id, jobhost, str(jobport))
                    )
                )
            self.logger.info("Starting %d threads" % len(bbs_kernels))
            for thread in bbs_kernels:
                thread.start()
            self.logger.debug("Waiting for all kernels to complete")
            for thread in bbs_kernels:
                thread.join()

        #         When GlobalControl finishes, our work here is done
        # ----------------------------------------------------------
        self.logger.info("Waiting for GlobalControl thread")
        bbs_control.join()

        if self.killswitch.is_set():
            #  If killswitch is set, then one of our processes failed so
            #                                   the whole run is invalid
            # ----------------------------------------------------------
            return 1

        self.outputs['mapfile'] = self.inputs['data_mapfile']
        return 0
Exemple #5
0
    def _schedule_jobs(self, jobs, max_per_node=None):
        """
        Schedule a series of compute jobs. Blocks until completion.

        If the ``method`` option in the ``remote`` configuration section is
        one of ``slurm_srun``, ``pbs_ssh``, ``slurm_ssh`` or ``ssh_generic``,
        the ``host`` attribute of every job is overwritten, round robin, with
        the node names obtained for that method before the jobs are
        dispatched.

        After the jobs have run, an XML summary (one ``job`` element per job
        that reported a ``job_duration``) is stored in
        ``self.outputs["return_xml"]``.

        :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
        :param max_per_node: maximum number of simultaneous jobs on any given node
        :type max_per_node: integer or none
        :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
        :raises PipelineQuit: if the node list for the configured method
            cannot be determined or is empty, or if the killswitch was set
            while the jobs were running
        """
        # The jobs are traversed several times below (host assignment,
        # dispatch, reporting); materialise them so that any iterable,
        # including a generator, is handled correctly.
        jobs = list(jobs)
        threadpool = []
        jobpool = {}
        if not max_per_node and self.config.has_option('remote',
                                                       'max_per_node'):
            max_per_node = self.config.getint('remote', 'max_per_node')
        limiter = ProcessLimiter(max_per_node)
        killswitch = threading.Event()

        if max_per_node:
            self.logger.info("Limiting to %d simultaneous jobs/node" %
                             max_per_node)

        # External cluster stuff
        method = None
        if self.config.has_option('remote', 'method'):
            method = self.config.get('remote', 'method')

        redistribute_hosts = False
        nodeliststr = []
        # JURECA SLURM
        if method == 'slurm_srun':
            proc = Popen(['srun', 'hostname'], False, stdout=PIPE, stderr=None)
            tup = communicate_returning_strings(proc)
            # remove duplicates. order not important
            nodeliststr = list(set(tup[0].rstrip('\n').split('\n')))
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # Hertfordshire cluster
        elif method == 'pbs_ssh':
            # special case - get the list of nodes from the pbs job
            try:
                filename = os.environ['PBS_NODEFILE']
            except KeyError:
                self.logger.error('PBS_NODEFILE not found.')
                raise PipelineQuit()
            with open(filename, 'r') as nodefile:
                for line in nodefile:
                    fields = line.split()
                    # skip blank lines instead of failing on them
                    if not fields:
                        continue
                    if fields[0] not in nodeliststr:
                        nodeliststr.append(fields[0])
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # get hostlist from slurm, but start jobs via ssh
        elif method == 'slurm_ssh':
            try:
                hostlist = os.environ['SLURM_JOB_NODELIST']
            except KeyError:
                self.logger.error(
                    'SLURM_JOB_NODELIST not found. You must have a slurm reservation!'
                )
                raise PipelineQuit()
            nodeliststr = expand_slurm_hostlist(hostlist)
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # generic case, node-names in an env-variable
        elif method == 'ssh_generic':
            if self.config.has_option('remote', 'nodelist_variable'):
                env_name = self.config.get('remote', 'nodelist_variable')
            else:
                env_name = 'PIPELINE_NODES'

            try:
                nodes = os.environ[env_name]
            except KeyError:
                self.logger.error('Env-variable \"' + env_name +
                                  '\" not found.')
                raise PipelineQuit()
            # remove duplicates. order not important
            nodeliststr = list(set(
                node.strip() for node in nodes.strip('[] ').split(',')
            ))
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # re-distribute the hosts if requested
        if redistribute_hosts:
            # Empty names (e.g. from empty command output or an empty
            # variable) are not usable hosts.
            nodeliststr = [name for name in nodeliststr if name]
            if jobs and not nodeliststr:
                self.logger.error(
                    'No compute nodes found for method "%s".' % method)
                raise PipelineQuit()
            # when nodes crash? length of slurm_nodelist and env slurm_nnodes dont match anymore
            nnodes = len(nodeliststr)
            # equal distribution, round robin
            for i, job in enumerate(jobs):
                job.host = nodeliststr[i % nnodes]

        with job_server(self.logger, jobpool,
                        self.error) as (jobhost, jobport):
            self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
            for job_id, job in enumerate(jobs):
                jobpool[job_id] = job
                threadpool.append(
                    threading.Thread(target=job.dispatch,
                                     args=(self.logger, self.config, limiter,
                                           job_id, jobhost, jobport,
                                           self.error, killswitch)))
            threadwatcher(threadpool, self.logger, killswitch)

        if killswitch.is_set():
            raise PipelineQuit()

        # Add information regarding specific nodes to an xml node.
        self.logger.debug("Adding node_logging_information")
        local_document = xml.Document()
        node_durations = local_document.createElement("nodes")
        for job_id, job in enumerate(jobs):
            # Test if the duration is there
            # fixme the name of node_durations is not logical
            if "job_duration" in job.results:
                child_node_duration = add_child(node_durations, "job")
                child_node_duration.setAttribute("job_id", str(job_id))
                child_node_duration.setAttribute("job_host", str(job.host))
                child_node_duration.setAttribute(
                    "duration", str(job.results["job_duration"]))

                # return code if present (Not there on error)
                if "returncode" in job.results:
                    child_node_duration.setAttribute(
                        "returncode", str(job.results['returncode']))
                else:
                    child_node_duration.setAttribute("returncode", str(-1))

                ## If there is 'node level' resource logging available
                if "monitor_stats" in job.results:
                    return_node = xml.parseString(
                        job.results['monitor_stats']).documentElement

                    child_node_duration.appendChild(return_node)

        # manually add the result xml as an ingredient output.
        # this allows backward compatible logging: If not read an additional
        # output does not matter
        self.outputs._fields["return_xml"] = ingredient.StringField(
            help="XML return data.")
        # toxml(encoding=...) yields bytes; decode so a text string is stored.
        self.outputs["return_xml"] = node_durations.toxml(
            encoding="ascii").decode('ascii')

        return jobpool
Exemple #6
0
    def _schedule_jobs(self, jobs, max_per_node=None):
        """
        Schedule a series of compute jobs. Blocks until completion.

        One thread is created per job, each targeting ``job.dispatch``; the
        threads are handed to ``threadwatcher`` inside a ``job_server``
        context. Afterwards a ``<nodes>`` XML element, holding one ``<job>``
        child for every job whose results contain ``job_duration``, is
        serialised and stored in ``self.outputs["return_xml"]``.

        :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
        :param max_per_node: maximum number of simultaneous jobs on any given node
        :type max_per_node: integer or none
        :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
        :raises PipelineQuit: if the killswitch event is set after the threads have been watched
        """
        # ``jobs`` is walked twice (dispatching, then reporting). Snapshot it
        # so that a one-shot iterable such as a generator is not already
        # exhausted when the reporting pass starts.
        jobs = list(jobs)

        threadpool = []
        jobpool = {}

        # Fall back on the [remote] max_per_node configuration option when the
        # caller did not supply a (truthy) limit.
        if not max_per_node and self.config.has_option('remote',
                                                       'max_per_node'):
            max_per_node = self.config.getint('remote', 'max_per_node')
        limiter = ProcessLimiter(max_per_node)
        killswitch = threading.Event()

        if max_per_node:
            self.logger.info("Limiting to %d simultaneous jobs/node" %
                             max_per_node)

        with job_server(self.logger, jobpool,
                        self.error) as (jobhost, jobport):
            self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
            for job_id, job in enumerate(jobs):
                # The job id is simply the position of the job in ``jobs``.
                jobpool[job_id] = job
                threadpool.append(
                    threading.Thread(target=job.dispatch,
                                     args=(self.logger, self.config, limiter,
                                           job_id, jobhost, jobport,
                                           self.error, killswitch)))
            threadwatcher(threadpool, self.logger, killswitch)

        if killswitch.is_set():
            raise PipelineQuit()

        # Add information regarding specific nodes to an xml node.
        self.logger.debug("Adding node_logging_information")
        local_document = xml.Document()
        node_durations = local_document.createElement("nodes")
        for job_id, job in enumerate(jobs):
            # Jobs that did not report a duration are left out of the report.
            # fixme the name of node_durations is not logical
            if "job_duration" not in job.results:
                continue

            child_node_duration = add_child(node_durations, "job")
            child_node_duration.setAttribute("job_id", str(job_id))
            child_node_duration.setAttribute("job_host", str(job.host))
            child_node_duration.setAttribute(
                "duration", str(job.results["job_duration"]))

            # return code if present (Not there on error); -1 otherwise
            if "returncode" in job.results:
                child_node_duration.setAttribute(
                    "returncode", str(job.results['returncode']))
            else:
                child_node_duration.setAttribute("returncode", str(-1))

            # If there is 'node level' resource logging available, embed the
            # parsed document element below this job's node.
            if "monitor_stats" in job.results:
                return_node = xml.parseString(
                    job.results['monitor_stats']).documentElement

                child_node_duration.appendChild(return_node)

        # manually add the result xml as an ingredient output.
        # this allows backward compatible logging: If not read an additional
        # output does not matter
        self.outputs._fields["return_xml"] = ingredient.StringField(
            help="XML return data.")

        # With an explicit encoding, toxml() yields bytes on Python 3 but a
        # native str on Python 2. Normalise to the native string type so that
        # the StringField always receives text.
        return_xml = node_durations.toxml(encoding="ascii")
        if not isinstance(return_xml, str):
            return_xml = return_xml.decode("ascii")
        self.outputs["return_xml"] = return_xml

        return jobpool
Exemple #7
0
    def go(self):
        """
        Run BBS over all input data.

        Cooks the ``parmdb``, ``sourcedb`` and ``vdsmaker`` recipes, then, for
        every group yielded by ``gvds_iterator``: deletes the rows for our key
        from ``blackboard.session``, combines the group's VDS files into a
        temporary GVDS, patches a control parset, and runs one control thread
        (``_run_bbs_control``) alongside one kernel thread
        (``_run_bbs_kernel``) per entry in the group.

        :rtype: integer; 0 on success, 1 if a sub-recipe reports failure or the killswitch was set
        :raises subprocess.CalledProcessError: if combinevds exits with a non-zero return code
        """
        self.logger.info("Starting BBS run")
        super(bbs, self).go()

        #             Generate source and parameter databases for all input data
        # ----------------------------------------------------------------------
        inputs = LOFARinput(self.inputs)
        inputs['args'] = self.inputs['args']
        inputs['executable'] = self.inputs['parmdbm']
        inputs['working_directory'] = self.config.get(
            "DEFAULT", "default_working_directory"
        )
        inputs['mapfile'] = self.task_definitions.get('parmdb', 'mapfile')
        inputs['suffix'] = ".instrument"
        outputs = LOFARoutput(self.inputs)
        if self.cook_recipe('parmdb', inputs, outputs):
            self.logger.warning("parmdb reports failure")
            return 1

        #    The same inputs object is reused for sourcedb; only the entries
        #                                       assigned below are overwritten.
        # ----------------------------------------------------------------------
        inputs['args'] = self.inputs['args']
        inputs['executable'] = self.inputs['makesourcedb']
        inputs['skymodel'] = self.inputs['skymodel']
        inputs['mapfile'] = self.task_definitions.get('sourcedb', 'mapfile')
        inputs['suffix'] = ".sky"
        outputs = LOFARoutput(self.inputs)
        if self.cook_recipe('sourcedb', inputs, outputs):
            self.logger.warning("sourcedb reports failure")
            return 1

        #              Build a GVDS file describing all the data to be processed
        # ----------------------------------------------------------------------
        self.logger.debug("Building VDS file describing all data for BBS")
        vds_file = os.path.join(
            self.config.get("layout", "job_directory"),
            "vds",
            "bbs.gvds"
        )
        inputs = LOFARinput(self.inputs)
        inputs['args'] = self.inputs['args']
        inputs['gvds'] = vds_file
        inputs['unlink'] = False
        inputs['makevds'] = self.inputs['makevds']
        inputs['combinevds'] = self.inputs['combinevds']
        inputs['nproc'] = self.inputs['nproc']
        inputs['directory'] = os.path.dirname(vds_file)
        outputs = LOFARoutput(self.inputs)
        if self.cook_recipe('vdsmaker', inputs, outputs):
            self.logger.warning("vdsmaker reports failure")
            return 1
        self.logger.debug("BBS GVDS is %s" % (vds_file,))

        #      Iterate over groups of subbands divided up for convenient cluster
        #         processing -- ie, no more than nproc subbands per compute node
        # ----------------------------------------------------------------------
        for to_process in gvds_iterator(vds_file, int(self.inputs["nproc"])):
            #               to_process is a list of (host, filename, vds) tuples
            #                   Only the VDS file names are needed at this point.
            # ------------------------------------------------------------------
            _hosts, _ms_names, vds_files = map(list, zip(*to_process))

            #             The BBS session database should be cleared for our key
            # ------------------------------------------------------------------
            self.logger.debug(
                "Cleaning BBS database for key %s" % (self.inputs["key"])
            )
            with closing(
                psycopg2.connect(
                    host=self.inputs["db_host"],
                    user=self.inputs["db_user"],
                    database=self.inputs["db_name"]
                )
            ) as db_connection:
                db_connection.set_isolation_level(
                    psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT
                )
                with closing(db_connection.cursor()) as db_cursor:
                    db_cursor.execute(
                        "DELETE FROM blackboard.session WHERE key=%s",
                        (self.inputs["key"],)
                    )

            #     BBS GlobalControl requires a GVDS file describing all the data
            #          to be processed. We assemble that from the separate parts
            #                                         already available on disk.
            #     Note that vds_file is rebound here to the per-group temporary
            #                                                         GVDS file.
            # ------------------------------------------------------------------
            self.logger.debug("Building VDS file describing data for BBS run")
            vds_dir = tempfile.mkdtemp(
                suffix=".%s" % (os.path.basename(__file__),)
            )
            vds_file = os.path.join(vds_dir, "bbs.gvds")
            combine_command = [self.inputs['combinevds'], vds_file] + vds_files
            combineproc = utilities.spawn_process(combine_command, self.logger)
            sout, serr = combineproc.communicate()
            log_process_output(
                self.inputs['combinevds'], sout, serr, self.logger
            )
            if combineproc.returncode != 0:
                #    Report the command that actually failed: the kernel command
                #                   string is not yet defined when we get here.
                raise subprocess.CalledProcessError(
                    combineproc.returncode, combine_command
                )

            #      Construct a parset for BBS GlobalControl by patching the GVDS
            #           file and database information into the supplied template
            #                                  BBDB.Port is not patched here.
            # ------------------------------------------------------------------
            self.logger.debug("Building parset for BBS control")
            bbs_parset = utilities.patch_parset(
                self.inputs['parset'],
                {
                    'Observation': vds_file,
                    'BBDB.Key': self.inputs['key'],
                    'BBDB.Name': self.inputs['db_name'],
                    'BBDB.User': self.inputs['db_user'],
                    'BBDB.Host': self.inputs['db_host'],
                }
            )
            self.logger.debug("BBS control parset is %s" % (bbs_parset,))

            try:
                #        When one of our processes fails, we set the killswitch.
                #      Everything else will then come crashing down, rather than
                #                                         hanging about forever.
                #    Signal handlers are invoked with (signum, frame), whereas
                #    Event.set() accepts no arguments, hence the small wrapper.
                # --------------------------------------------------------------
                self.killswitch = threading.Event()
                self.killswitch.clear()
                signal.signal(
                    signal.SIGTERM,
                    lambda signum, frame: self.killswitch.set()
                )

                #                           GlobalControl runs in its own thread
                # --------------------------------------------------------------
                run_flag = threading.Event()
                run_flag.clear()
                bbs_control = threading.Thread(
                    target=self._run_bbs_control,
                    args=(bbs_parset, run_flag)
                )
                bbs_control.start()
                run_flag.wait()    # Wait for control to start before proceeding

                #      We run BBS KernelControl on each compute node by directly
                #                             invoking the node script using SSH
                #      Note that we use a job_server to send out job details and
                #           collect logging information, so we define a bunch of
                #    ComputeJobs. However, we need more control than the generic
                #     ComputeJob.dispatch method supplies, so we'll control them
                #                                          with our own threads.
                # --------------------------------------------------------------
                # NOTE(review): this reads __file__ from the instance, whereas
                # the temporary directory suffix above uses the module-level
                # __file__ -- confirm the attribute exists on the recipe.
                command = "python %s" % (
                    self.__file__.replace('master', 'nodes')
                )
                env = {
                    "LOFARROOT": utilities.read_initscript(
                        self.logger, self.inputs['initscript']
                    )["LOFARROOT"],
                    "PYTHONPATH": self.config.get('deploy', 'engine_ppath'),
                    "LD_LIBRARY_PATH": self.config.get(
                        'deploy', 'engine_lpath'
                    )
                }
                jobpool = {}
                bbs_kernels = []
                with job_server(
                    self.logger, jobpool, self.error
                ) as (jobhost, jobport):
                    self.logger.debug(
                        "Job server at %s:%d" % (jobhost, jobport)
                    )
                    for job_id, details in enumerate(to_process):
                        host, ms_name, _vds = details
                        jobpool[job_id] = ComputeJob(
                            host, command,
                            arguments=[
                                self.inputs['kernel_exec'],
                                self.inputs['initscript'],
                                ms_name,
                                self.inputs['key'],
                                self.inputs['db_name'],
                                self.inputs['db_user'],
                                self.inputs['db_host']
                            ]
                        )
                        bbs_kernels.append(
                            threading.Thread(
                                target=self._run_bbs_kernel,
                                args=(host, command, env, job_id,
                                      jobhost, str(jobport))
                            )
                        )
                    self.logger.info("Starting %d threads" % len(bbs_kernels))
                    for thread in bbs_kernels:
                        thread.start()
                    self.logger.debug("Waiting for all kernels to complete")
                    for thread in bbs_kernels:
                        thread.join()

                #         When GlobalControl finishes, our work here is done
                # ----------------------------------------------------------
                self.logger.info("Waiting for GlobalControl thread")
                bbs_control.join()
            finally:
                #     Always remove the patched parset and the temporary GVDS
                #                                directory created for this group.
                # --------------------------------------------------------------
                os.unlink(bbs_parset)
                shutil.rmtree(vds_dir)
                if self.killswitch.is_set():
                    #  If killswitch is set, then one of our processes failed so
                    #                                   the whole run is invalid
                    #    NOTE: returning from within finally also discards any
                    #    exception that was propagating out of the try block.
                    # ----------------------------------------------------------
                    return 1

        return 0