Example #1
    def getStatsAsXmlString(self):
        local_document = xml.Document()
        resource_stat_xml = local_document.createElement("resource_usage")

        if not self.pid_stats:  # if there are no entries in the stats dict
            resource_stat_xml.setAttribute("noStatsRecorded", "true")
            return resource_stat_xml.toxml(encoding="ascii")

        try:
            for idx, (key, value) in enumerate(self.pid_stats.items()):
                # if there are entries
                if value:
                    child_pid = add_child(resource_stat_xml, "process")
                    child_pid.setAttribute("idx", str(idx))
                    # The first entry should contain the executable name;
                    # only needed once.
                    child_pid.setAttribute("executable", str(value[0][0]))
                    child_pid.setAttribute("pid", str(key))
                    for entry in value:
                        if "MEM" in str(entry[6]):  # this is the default value
                            continue
                        data_point = add_child(child_pid, "data_point")
                        data_point.setAttribute("timestamp", str(entry[1]))
                        data_point.setAttribute("read_bytes", str(entry[2]))
                        data_point.setAttribute("write_bytes", str(entry[3]))
                        data_point.setAttribute("cancelled_bytes",
                                                str(entry[4]))
                        data_point.setAttribute("cpu", str(entry[5]))
                        data_point.setAttribute("mem", str(entry[6]))
        except Exception:
            self.logger.error("monitoring statistic recording failed")
            resource_stat_xml.setAttribute("noStatsRecorded", "Exception")
            return resource_stat_xml.toxml(encoding="ascii")

        return resource_stat_xml.toxml(encoding="ascii")
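
The add_child helper used above is not shown on this page. Judging from the tests in Examples #5 and #8 below, it creates a new element, appends it to the given parent, and returns it. A minimal sketch, assuming the examples import xml.dom.minidom as xml (the real helper lives in lofarpipe.support.xmllogging):

    import xml.dom.minidom as xml

    def add_child(parent, name):
        # Hypothetical reimplementation for illustration: create an element
        # in the parent's owning document, attach it, and hand it back.
        child = parent.ownerDocument.createElement(name)
        parent.appendChild(child)
        return child
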
Example #2
    def test_get_child(self):
        local_document = xml.Document()
        head = local_document.createElement("head")
        child = xmllogging.add_child(head, "child")
        second_child = xmllogging.add_child(head, "second_child")
        third_child = xmllogging.add_child(head, "child")

        # call the function
        returned_child = get_child(head, "child")

        # test output
        self.assertTrue(returned_child == child,
                "get_child did not return the first child matching the name")
        self.assertTrue(returned_child != third_child,
                "get_child did not return the first child matching the name")
Example #3
    def test_get_child(self):
        local_document = xml.Document()
        head = local_document.createElement("head")
        child = xmllogging.add_child(head, "child")
        second_child = xmllogging.add_child(head, "second_child")
        third_child = xmllogging.add_child(head, "child")

        # call the function
        returned_child = get_child(head, "child")

        # test output
        self.assertTrue(
            returned_child == child,
            "get_child did not return the first child matching the name")
        self.assertTrue(
            returned_child != third_child,
            "get_child did not return the first child matching the name")
Example #4
    def getStatsAsXmlString(self):
        """
        returns the collected data as a xml file
        Data is cleaned and labeled according to the metric used.
        """
        local_document = xml.Document()
        resource_stat_xml = local_document.createElement("resource_usage")
        resource_stat_xml.setAttribute("node_recipe_pid",str(self.owner_pid))

        if not self.pid_stats:  # if there are no entries in the stats dict
            resource_stat_xml.setAttribute("noStatsRecorded", "true")
            return resource_stat_xml.toxml(encoding = "ascii").decode('ascii')
        
        try:
            # TODO: The returned values are not in order and the owner PID
            # might not be printed with idx 0. Maybe print it separately.
            for idx, (key, value) in enumerate(self.pid_stats.items()):
                # if there are entries
                if value:
                    child_pid = add_child(resource_stat_xml, "process")
                    child_pid.setAttribute("idx", str(idx))
                    # The first entry should contain the executable name;
                    # only needed once.
                    child_pid.setAttribute("executable", str(value[0][0]))
                    child_pid.setAttribute("pid", str(key))
                    for entry in value:
                        # TODO: probably no longer needed with updated bash 
                        # script
                        if "MEM" in str(entry[6]):  # this is the default value
                            continue
                        data_point = add_child(child_pid, "data_point")
                        data_point.setAttribute("timestamp", str(entry[1]))
                        data_point.setAttribute("read_bytes", str(entry[2]))
                        data_point.setAttribute("write_bytes", str(entry[3]))
                        data_point.setAttribute("cancelled_bytes", 
                                                str(entry[4]))
                        data_point.setAttribute("cpu", str(entry[5]))
                        data_point.setAttribute("mem", str(entry[6]))
        except Exception:
            self.logger.warning("monitoring statistic recording failed")
            resource_stat_xml.setAttribute("noStatsRecorded", "Exception")
            # TODO: coalesce these two returns in one "finally:"
            return resource_stat_xml.toxml(encoding="ascii").decode('ascii')

        return resource_stat_xml.toxml(encoding="ascii").decode('ascii')
Example #5
    def test_add_child(self):
        local_document = xml.Document()
        head = local_document.createElement("head")
        returned_node = xmllogging.add_child(head, "child")

        self.assertTrue(
            len(head.childNodes) == 1, "add_child added more than one child")
        self.assertTrue(head.childNodes[0].tagName == "child",
                        "add_child added a child with an incorrect name")
        self.assertTrue(returned_node == head.childNodes[0],
                        "add_child should return the created node")
Example #6
    def test_get_child_not_found(self):
        local_document = xml.Document()
        head = local_document.createElement("head")
        child = xmllogging.add_child(head, "child")

        # call the function
        returned_child = get_child(head, "does_not_exist")

        # test output
        self.assertTrue(returned_child is None,
                "when no children are found get_child should return None")
Example #7
    def test_get_child_not_found(self):
        local_document = xml.Document()
        head = local_document.createElement("head")
        child = xmllogging.add_child(head, "child")

        # call the function
        returned_child = get_child(head, "does_not_exist")

        # test output
        self.assertTrue(
            returned_child is None,
            "when no children are found get_child should return None")
Example #8
    def test_add_child(self):
        local_document = xml.Document()
        head = local_document.createElement("head")
        returned_node = xmllogging.add_child(head, "child")

        self.assertTrue(len(head.childNodes) == 1,
                        "add_child added more than one child")
        self.assertTrue(head.childNodes[0].tagName == "child",
                        "add_child added a child with an incorrect name")
        self.assertTrue(returned_node == head.childNodes[0],
                        "add_child should return the created node")
Example #9
    def _schedule_jobs(self, jobs, max_per_node=None):
        """
        Schedule a series of compute jobs. Blocks until completion.

        :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
        :param max_per_node: maximum number of simultaneous jobs on any given node
        :type max_per_node: integer or None
        :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
        """
        threadpool = []
        jobpool = {}
        if not max_per_node and self.config.has_option('remote', 'max_per_node'):
            max_per_node = self.config.getint('remote', 'max_per_node')
        limiter = ProcessLimiter(max_per_node)
        killswitch = threading.Event()

        if max_per_node:
            self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

        with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
            self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
            for job_id, job in enumerate(jobs):
                jobpool[job_id] = job
                threadpool.append(
                    threading.Thread(
                        target=job.dispatch,
                        args=(
                            self.logger, self.config, limiter, job_id,
                            jobhost, jobport, self.error, killswitch
                        )
                    )
                )
            threadwatcher(threadpool, self.logger, killswitch)

        if killswitch.isSet():
            raise PipelineQuit()

        # Add information regarding specific nodes to an xml node.
        self.logger.debug("Adding node_logging_information")
        local_document = xml.Document()
        node_durations = local_document.createElement("nodes")
        for job_id, job in enumerate(jobs):
            # Test if the duration is there
            if "job_duration" in job.results:
                child_node_duration = add_child(node_durations, "job")
                child_node_duration.setAttribute("job_id", str(job_id))
                child_node_duration.setAttribute("duration",
                     str(job.results["job_duration"]))
                # return code if present (not present on error)
                if "returncode" in job.results:
                    child_node_duration.setAttribute(
                        "returncode", str(job.results['returncode']))
                else:
                    child_node_duration.setAttribute(
                        "returncode", str(-1))


        # Manually add the result xml as an ingredient output. This allows
        # backward-compatible logging: an additional output that is never
        # read does not matter.
        self.outputs._fields["return_xml"] = ingredient.StringField(
                                                help="XML return data.")
        self.outputs["return_xml"] = node_durations.toxml(encoding="ascii")

        return jobpool
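
Downstream code that consumes the "return_xml" output can parse it straight back with minidom. A hedged sketch, assuming the recipe's outputs mapping from above is available as outputs:

    import xml.dom.minidom

    doc = xml.dom.minidom.parseString(outputs["return_xml"])
    for job in doc.getElementsByTagName("job"):
        print(job.getAttribute("job_id"),
              job.getAttribute("duration"),
              job.getAttribute("returncode"))
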
Example #10
    def _schedule_jobs(self, jobs, max_per_node=None):
        """
        Schedule a series of compute jobs. Blocks until completion.

        :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
        :param max_per_node: maximum number of simultaneous jobs on any given node
        :type max_per_node: integer or None
        :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
        """
        threadpool = []
        jobpool = {}
        if not max_per_node and self.config.has_option('remote', 'max_per_node'):
            max_per_node = self.config.getint('remote', 'max_per_node')
        limiter = ProcessLimiter(max_per_node)
        killswitch = threading.Event()

        if max_per_node:
            self.logger.info("Limiting to %d simultaneous jobs/node" % max_per_node)

        # External cluster stuff
        try:
            method = self.config.get('remote', 'method')
        except Exception:
            method = None

        redistribute_hosts = False
        # JURECA SLURM
        if method == 'slurm_srun':
            nodeliststr = []
            hargs = ['srun', 'hostname']
            proc = Popen(hargs, False, stdout=PIPE, stderr=None)
            tup = proc.communicate()
            nodeliststr = tup[0].rstrip('\n').split('\n')
            # remove duplicates. order not important
            nodeliststr = list(set(nodeliststr))
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # Hertfordshire cluster
        elif method == 'pbs_ssh':
            # special case - get the list of nodes from the pbs job
            nodeliststr = []
            try:
                filename = os.environ['PBS_NODEFILE']
            except KeyError:
                self.logger.error('PBS_NODEFILE not found.')
                raise PipelineQuit()
            with open(filename, 'r') as file:
                for line in file:
                    node_name = line.split()[0]
                    if node_name not in nodeliststr:
                        nodeliststr.append(node_name)
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # get hostlist from slurm, but start jobs via ssh
        elif method == 'slurm_ssh':
            try:
                hostlist = os.environ['SLURM_JOB_NODELIST']
            except KeyError:
                self.logger.error('SLURM_JOB_NODELIST not found. You must have a slurm reservation!')
                raise PipelineQuit()
            nodeliststr = expand_slurm_hostlist(hostlist)
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # generic case, node-names in an env-variable
        elif method == 'ssh_generic':
            nodeliststr = []
            try:
                env_name = self.config.get('remote', 'nodelist_variable')
            except Exception:
                env_name = 'PIPELINE_NODES'

            try:
                nodes = os.environ[env_name]
            except KeyError:
                self.logger.error('Env-variable "' + env_name + '" not found.')
                raise PipelineQuit()
            nodeliststr = [node.strip() for node in nodes.strip('[] ').split(',')]
            # remove duplicates. order not important
            nodeliststr = list(set(nodeliststr))
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # re-distribute the hosts if requested
        if redistribute_hosts:
            # equal distribution
            total = len(jobs)
            # when nodes crash, the length of the slurm nodelist and the
            # env variable SLURM_NNODES may no longer match
            nnodes = len(nodeliststr)
            # round robin
            nodelist = []
            for i in range(total):
                nodelist.append(nodeliststr[i%nnodes])

            for i, job in enumerate(jobs):
                job.host = nodelist[i]

        with job_server(self.logger, jobpool, self.error) as (jobhost, jobport):
            self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
            for job_id, job in enumerate(jobs):
                jobpool[job_id] = job
                threadpool.append(
                    threading.Thread(
                        target=job.dispatch,
                        args=(
                            self.logger, self.config, limiter, job_id,
                            jobhost, jobport, self.error, killswitch
                        )
                    )
                )
            threadwatcher(threadpool, self.logger, killswitch)

        if killswitch.isSet():
            raise PipelineQuit()

        # Add information regarding specific nodes to an xml node.
        self.logger.debug("Adding node_logging_information")
        local_document = xml.Document()
        node_durations = local_document.createElement("nodes")
        for job_id, job in enumerate(jobs):
            # Test if the duration is there
            # fixme the name of node_durations is not logical
            if "job_duration" in job.results:
                child_node_duration = add_child(node_durations, "job")
                child_node_duration.setAttribute("job_id", str(job_id))
                child_node_duration.setAttribute("job_host", str(job.host))
                child_node_duration.setAttribute("duration",
                     str(job.results["job_duration"]))

                # return code if present (Not there on error)
                if "returncode" in job.results:
                    child_node_duration.setAttribute(
                        "returncode", str(job.results['returncode']))
                else:
                    child_node_duration.setAttribute(
                        "returncode", str(-1))

                # If there is 'node level' resource logging available
                if "monitor_stats" in job.results:
                    return_node = xml.parseString(
                        job.results['monitor_stats']).documentElement

                    child_node_duration.appendChild(return_node)


        # Manually add the result xml as an ingredient output. This allows
        # backward-compatible logging: an additional output that is never
        # read does not matter.
        self.outputs._fields["return_xml"] = ingredient.StringField(
                                                help="XML return data.")
        self.outputs["return_xml"] = node_durations.toxml(encoding="ascii")

        return jobpool
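
expand_slurm_hostlist is not shown on this page. SLURM reports nodes in a compact form such as "node[01-03],login1"; a minimal expansion covering simple names and zero-padded ranges might look like this (the real helper may support more of SLURM's syntax):

    import re

    def expand_slurm_hostlist(hostlist):
        # Hypothetical sketch: expand "node[01-03],login1" into
        # ["node01", "node02", "node03", "login1"].
        hosts = []
        for part in re.split(r',(?![^\[]*\])', hostlist):  # commas outside [...]
            match = re.match(r'(.*)\[(.*)\]$', part)
            if not match:
                hosts.append(part)
                continue
            prefix, ranges = match.groups()
            for rng in ranges.split(','):
                if '-' in rng:
                    lo, hi = rng.split('-')
                    for i in range(int(lo), int(hi) + 1):
                        hosts.append("%s%0*d" % (prefix, len(lo), i))
                else:
                    hosts.append(prefix + rng)
        return hosts
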
Example #11
    def _schedule_jobs(self, jobs, max_per_node=None):
        """
        Schedule a series of compute jobs. Blocks until completion.

        :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
        :param max_per_node: maximum number of simultaneous jobs on any given node
        :type max_per_node: integer or None
        :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
        """
        threadpool = []
        jobpool = {}
        if not max_per_node and self.config.has_option('remote',
                                                       'max_per_node'):
            max_per_node = self.config.getint('remote', 'max_per_node')
        limiter = ProcessLimiter(max_per_node)
        killswitch = threading.Event()

        if max_per_node:
            self.logger.info("Limiting to %d simultaneous jobs/node" %
                             max_per_node)

        # External cluster stuff
        try:
            method = self.config.get('remote', 'method')
        except Exception:
            method = None

        redistribute_hosts = False
        # JURECA SLURM
        if method == 'slurm_srun':
            nodeliststr = []
            hargs = ['srun', 'hostname']
            proc = Popen(hargs, False, stdout=PIPE, stderr=None)
            tup = communicate_returning_strings(proc)
            nodeliststr = tup[0].rstrip('\n').split('\n')
            # remove duplicates. order not important
            nodeliststr = list(set(nodeliststr))
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # Hertfordshire cluster
        elif method == 'pbs_ssh':
            # special case - get the list of nodes from the pbs job
            nodeliststr = []
            try:
                filename = os.environ['PBS_NODEFILE']
            except KeyError:
                self.logger.error('PBS_NODEFILE not found.')
                raise PipelineQuit()
            with open(filename, 'r') as file:
                for line in file:
                    node_name = line.split()[0]
                    if node_name not in nodeliststr:
                        nodeliststr.append(node_name)
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # get hostlist from slurm, but start jobs via ssh
        elif method == 'slurm_ssh':
            try:
                hostlist = os.environ['SLURM_JOB_NODELIST']
            except KeyError:
                self.logger.error(
                    'SLURM_JOB_NODELIST not found. You must have a slurm reservation!'
                )
                raise PipelineQuit()
            nodeliststr = expand_slurm_hostlist(hostlist)
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # generic case, node-names in an env-variable
        elif method == 'ssh_generic':
            nodeliststr = []
            try:
                env_name = self.config.get('remote', 'nodelist_variable')
            except Exception:
                env_name = 'PIPELINE_NODES'

            try:
                nodes = os.environ[env_name]
            except KeyError:
                self.logger.error('Env-variable "' + env_name +
                                  '" not found.')
                raise PipelineQuit()
            nodeliststr = [
                node.strip() for node in nodes.strip('[] ').split(',')
            ]
            # remove duplicates. order not important
            nodeliststr = list(set(nodeliststr))
            # set flag to re-distribute the hosts for the jobs
            redistribute_hosts = True

        # re-distribute the hosts if requested
        if redistribute_hosts:
            # equal distribution
            total = len(jobs)
            # when nodes crash, the length of the slurm nodelist and the
            # env variable SLURM_NNODES may no longer match
            nnodes = len(nodeliststr)
            # round robin
            nodelist = []
            for i in range(total):
                nodelist.append(nodeliststr[i % nnodes])

            for i, job in enumerate(jobs):
                job.host = nodelist[i]

        with job_server(self.logger, jobpool,
                        self.error) as (jobhost, jobport):
            self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
            for job_id, job in enumerate(jobs):
                jobpool[job_id] = job
                threadpool.append(
                    threading.Thread(target=job.dispatch,
                                     args=(self.logger, self.config, limiter,
                                           job_id, jobhost, jobport,
                                           self.error, killswitch)))
            threadwatcher(threadpool, self.logger, killswitch)

        if killswitch.isSet():
            raise PipelineQuit()

        # Add information regarding specific nodes to an xml node.
        self.logger.debug("Adding node_logging_information")
        local_document = xml.Document()
        node_durations = local_document.createElement("nodes")
        for job_id, job in enumerate(jobs):
            # Test if the duration is there
            # fixme the name of node_durations is not logical
            if "job_duration" in job.results:
                child_node_duration = add_child(node_durations, "job")
                child_node_duration.setAttribute("job_id", str(job_id))
                child_node_duration.setAttribute("job_host", str(job.host))
                child_node_duration.setAttribute(
                    "duration", str(job.results["job_duration"]))

                # return code if present (Not there on error)
                if "returncode" in job.results:
                    child_node_duration.setAttribute(
                        "returncode", str(job.results['returncode']))
                else:
                    child_node_duration.setAttribute("returncode", str(-1))

                # If there is 'node level' resource logging available
                if "monitor_stats" in job.results:
                    return_node = xml.parseString(
                        job.results['monitor_stats']).documentElement

                    child_node_duration.appendChild(return_node)

        # Manually add the result xml as an ingredient output. This allows
        # backward-compatible logging: an additional output that is never
        # read does not matter.
        self.outputs._fields["return_xml"] = ingredient.StringField(
            help="XML return data.")
        self.outputs["return_xml"] = node_durations.toxml(
            encoding="ascii").decode('ascii')

        return jobpool
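
The round-robin block above cycles the jobs over whatever hosts were found. A tiny standalone illustration with hypothetical host names:

    nodeliststr = ["node01", "node02", "node03"]
    njobs = 7  # stand-in for len(jobs)
    nodelist = [nodeliststr[i % len(nodeliststr)] for i in range(njobs)]
    # -> ['node01', 'node02', 'node03', 'node01', 'node02', 'node03', 'node01']
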
Example #12
    def _schedule_jobs(self, jobs, max_per_node=None):
        """
        Schedule a series of compute jobs. Blocks until completion.

        :param jobs: iterable of :class:`~lofarpipe.support.remotecommand.ComputeJob` to be scheduled
        :param max_per_node: maximum number of simultaneous jobs on any given node
        :type max_per_node: integer or None
        :rtype: dict mapping integer job id to :class:`~lofarpipe.support.remotecommand.ComputeJob`
        """
        threadpool = []
        jobpool = {}
        if not max_per_node and self.config.has_option('remote',
                                                       'max_per_node'):
            max_per_node = self.config.getint('remote', 'max_per_node')
        limiter = ProcessLimiter(max_per_node)
        killswitch = threading.Event()

        if max_per_node:
            self.logger.info("Limiting to %d simultaneous jobs/node" %
                             max_per_node)

        with job_server(self.logger, jobpool,
                        self.error) as (jobhost, jobport):
            self.logger.debug("Job dispatcher at %s:%d" % (jobhost, jobport))
            for job_id, job in enumerate(jobs):
                jobpool[job_id] = job
                threadpool.append(
                    threading.Thread(target=job.dispatch,
                                     args=(self.logger, self.config, limiter,
                                           job_id, jobhost, jobport,
                                           self.error, killswitch)))
            threadwatcher(threadpool, self.logger, killswitch)

        if killswitch.isSet():
            raise PipelineQuit()

        # Add information regarding specific nodes to an xml node.
        self.logger.debug("Adding node_logging_information")
        local_document = xml.Document()
        node_durations = local_document.createElement("nodes")
        for job_id, job in enumerate(jobs):
            # Test if the duration is there
            # fixme the name of node_durations is not logical
            if "job_duration" in job.results:
                child_node_duration = add_child(node_durations, "job")
                child_node_duration.setAttribute("job_id", str(job_id))
                child_node_duration.setAttribute("job_host", str(job.host))
                child_node_duration.setAttribute(
                    "duration", str(job.results["job_duration"]))

                # return code if present (Not there on error)
                if "returncode" in job.results:
                    child_node_duration.setAttribute(
                        "returncode", str(job.results['returncode']))
                else:
                    child_node_duration.setAttribute("returncode", str(-1))

                # If there is 'node level' resource logging available
                if "monitor_stats" in job.results:
                    return_node = xml.parseString(
                        job.results['monitor_stats']).documentElement

                    child_node_duration.appendChild(return_node)

        # Manually add the result xml as an ingredient output. This allows
        # backward-compatible logging: an additional output that is never
        # read does not matter.
        self.outputs._fields["return_xml"] = ingredient.StringField(
            help="XML return data.")
        self.outputs["return_xml"] = node_durations.toxml(encoding="ascii")

        return jobpool
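
One difference worth noting: under Python 3, toxml(encoding="ascii") returns bytes, which is why Example #11 appends .decode('ascii'). If this last variant runs under Python 3 and "return_xml" is expected to be text, the final assignment would need the same treatment:

    self.outputs["return_xml"] = node_durations.toxml(
        encoding="ascii").decode('ascii')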