Example #1
class SupDExe(object):

    def __init__(self, eeagent_cfg):
        self._working_dir = eeagent_cfg.launch_type.supd_directory
        self._eename = eeagent_cfg.name
        supdexe = _set_param_or_default(eeagent_cfg.launch_type, 'supdexe', None)
        self._slots = int(eeagent_cfg.slots)
        self._factory = SupDPidanticFactory(directory=self._working_dir, name=self._eename, supdexe=supdexe)
        pidantic_instances = self._factory.reload_instances()
        self._known_pws = {}
        for name in pidantic_instances:
            pidantic = pidantic_instances[name]
            pw = PidWrapper(self, name)
            pw.set_pidantic(pidantic)
            self._known_pws[name] = pw

    def run(self, name, parameters):
        pw = PidWrapper(self, name)
        self._known_pws[name] = pw
        command = parameters['exec'] + " " + ' '.join(parameters['argv'])

        dir = self._working_dir
        if "working_directory" in parameters:
            dir = parameters["working_directory"]
        pid = self._factory.get_pidantic(command=command, process_name=name, directory=dir)
        pw.set_pidantic(pid)

        running_jobs = self._get_running()
        x = len(running_jobs)
        if x <= self._slots:
            pid.start()
        else:
            pid.cancel_request()
        return pw

    def get_known_pws(self):
        return self._known_pws
        
    def _remove_proc(self, proc_name):
        del self._known_pws[proc_name]

    def lookup_id(self, name):
        if name not in self._known_pws:
            return None
        return self._known_pws[name]

    def get_all(self):
        return self._known_pws
        
    def _get_running(self):
        running_states = [PidWrapper.RUNNING, PidWrapper.TERMINATING, PidWrapper.REQUESTING]
        procs = self.get_all().values()
        # A process counts toward the slot limit while it is in any of these states
        running = [i for i in procs if i.get_state() in running_states]
        return running

    def poll(self):
        return self._factory.poll()

    def terminate(self):
        self._factory.terminate()
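A minimal usage sketch for the SupDExe wrapper above, assuming eeagent_cfg is the attribute-style config object the constructor expects (launch_type.supd_directory, name, slots); the process name, command, and working directory below are placeholders.

# Hedged usage sketch: eeagent_cfg, the process name, and the command are
# illustrative placeholders, not part of the example above.
exe = SupDExe(eeagent_cfg)
proc = exe.run('worker-1', {'exec': '/bin/sleep', 'argv': ['600'],
                            'working_directory': '/tmp'})
exe.poll()                    # let the pidantic factory refresh process states
state = proc.get_state()      # e.g. PidWrapper.RUNNING once the process starts
exe.terminate()               # tears down the SupD factory and its processes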
Example #2
class BrickWriterDispatcher(object):

    def __init__(self, failure_callback, num_workers=1, pidantic_dir=None, working_dir=None):
        self.guid = create_guid()
        self.prep_queue = queue.Queue()
        self.work_queue = queue.Queue()
        self._pending_work = {}
        self._stashed_work = {}
        self._active_work = {}
        self._failures = {}
        self._do_stop = False
        self._count = -1
        self._shutdown = False
        self._failure_callback = failure_callback

        self.context = zmq.Context(1)
        self.prov_sock = self.context.socket(zmq.REP)
        self.prov_port = self._get_port(self.prov_sock)
        log.info('Provisioning url: tcp://*:{0}'.format(self.prov_port))

        self.resp_sock = self.context.socket(zmq.SUB)
        self.resp_port = self._get_port(self.resp_sock)
        self.resp_sock.setsockopt(zmq.SUBSCRIBE, '')
        log.info('Response url: tcp://*:{0}'.format(self.resp_port))

        self.num_workers = num_workers if num_workers > 0 else 1
        self.is_single_worker = self.num_workers == 1
        self.working_dir = working_dir or '.'
        self.pidantic_dir = pidantic_dir or './pid_dir'
        self.workers = []

        self._configure_workers()

    def _get_port(self, socket):
        # Bind to the first free port in PORT_RANGE; fail loudly if none is available
        for x in xrange(PORT_RANGE[0], PORT_RANGE[1]):
            try:
                socket.bind('tcp://*:{0}'.format(x))
                return x
            except ZMQError:
                continue
        raise SystemError('Could not bind to any port in PORT_RANGE')

    def _configure_workers(self):
        # TODO: if num_workers == 1, simply run one in-line (runs in a greenlet anyhow)
        if self.is_single_worker:
            from brick_worker import run_worker
            worker = run_worker(self.prov_port, self.resp_port)
            self.workers.append(worker)
        else:
            if os.path.exists(self.pidantic_dir):
                bdp = os.path.join(self.pidantic_dir, 'brick_dispatch')
                if os.path.exists(bdp):
                    import zipfile, zlib
                    with zipfile.ZipFile(os.path.join(bdp, 'archived_worker_logs.zip'), 'a', zipfile.ZIP_DEFLATED) as f:
                        names = f.namelist()
                        for x in [x for x in os.listdir(bdp) if x.startswith('worker_') and x not in names]:
                            fn = os.path.join(bdp, x)
                            f.write(filename=fn, arcname=x)
                            os.remove(fn)

            else:
                os.makedirs(self.pidantic_dir)

            self.factory = SupDPidanticFactory(name='brick_dispatch', directory=self.pidantic_dir)
            # Check for old workers - FOR NOW, TERMINATE THEM TODO: These should be reusable...
            old_workers = self.factory.reload_instances()
            for x in old_workers:
                old_workers[x].cleanup()

            worker_cmd = 'bin/python coverage_model/brick_worker.py {0} {1}'.format(self.prov_port, self.resp_port)
            for x in xrange(self.num_workers):
                w = self.factory.get_pidantic(command=worker_cmd, process_name='worker_{0}'.format(x), directory=os.path.realpath(self.working_dir))
                w.start()
                self.workers.append(w)

            ready = False
            while not ready:
                self.factory.poll()
                for x in self.workers:
                    s = x.get_state()
                    if s is PIDanticState.STATE_STARTING:
                        # At least one worker is still starting - poll again
                        break
                    elif s is PIDanticState.STATE_RUNNING:
                        continue
                    elif s is PIDanticState.STATE_EXITED:
                        self.shutdown()
                        raise SystemError('Error starting worker - cannot continue')
                    else:
                        raise SystemError('Problem starting worker - cannot continue')
                else:
                    # Reached only when no worker broke out of the loop - all are running
                    ready = True

    def has_pending_work(self):
        return len(self._pending_work) > 0

    def has_active_work(self):
        return len(self._active_work) > 0

    def has_stashed_work(self):
        return len(self._stashed_work) > 0

    def is_dirty(self):
        if not self.has_active_work():
            if not self.has_stashed_work():
                if not self.has_pending_work():
                    return False

        return True

    def get_dirty_values_async_result(self):
        dirty_async_res = AsyncResult()
        def dirty_check(self, res):
            while True:
                if self.is_dirty():
                    time.sleep(0.1)
                else:
                    res.set(True)
                    break

        spawn(dirty_check, self, dirty_async_res)

        return dirty_async_res

    def run(self):
        self._do_stop = False
        self._org_g = spawn(self.organize_work)

        self._prov_g = spawn(self.provisioner)
        self._rec_g = spawn(self.receiver)

    def shutdown(self, force=False, timeout=None):
        if self._shutdown:
            return
        # CBM TODO: Revisit to ensure this won't strand work or terminate workers before they complete their work...!!
        self._do_stop = True
        try:
            log.debug('Force == %s', force)
            if not force:
                log.debug('Waiting for organizer; timeout == %s', timeout)
                # Wait for the organizer to finish - ensures the prep_queue is empty
                self._org_g.join(timeout=timeout)

                log.debug('Waiting for provisioner; timeout == %s', timeout)
                # Wait for the provisioner to finish - ensures work_queue is empty
                self._prov_g.join(timeout=timeout)

                log.debug('Waiting for receiver; timeout == %s', timeout)
                # Wait for the receiver to finish - allows workers to finish their work
                self._rec_g.join(timeout=timeout)

            log.debug('Killing organizer, provisioner, and receiver greenlets')
            # Terminate the greenlets
            self._org_g.kill()
            self._prov_g.kill()
            self._rec_g.kill()
            log.debug('Greenlets killed')

            log.debug('Shutdown workers')
            # Shutdown workers - work should be completed by now...
            if self.is_single_worker:
                # Current work will be finished
                self.workers[0].stop()
            else:
                self.workers = self.factory.reload_instances()
                # CBM TODO:  THIS DOES NOT ALLOW CURRENT WORK TO FINISH!!!
                for x in self.workers:
                    self.workers[x].cleanup()
                self.factory.terminate()
            log.debug('Workers shutdown')
        except:
            raise
        finally:
            log.debug('Closing provisioner and receiver sockets')
            # Close sockets
            self.prov_sock.close()
            self.resp_sock.close()
            log.debug('Sockets closed')
            log.debug('Terminating the context')
            self.context.term()
            log.debug('Context terminated')

            self._shutdown = True

    def organize_work(self):
        while True:
            if self._do_stop and self.prep_queue.empty():
                break
            try:
                # Timeout after 1 second to allow stopage and _stashed_work cleanup
                wd = self.prep_queue.get(timeout=1)
            except queue.Empty:
                # No new work added - see if there's anything on the stash to cleanup...
                for k in self._stashed_work:
                    log.debug('Cleanup _stashed_work...')
                    # Just want to trigger cleanup of the _stashed_work, pass an empty list of 'work', gets discarded
                    self.put_work(k, self._stashed_work[k][0], [])
                continue

            try:
                k, wm, w = wd
                is_list = isinstance(w, list)
                if k not in self._stashed_work and len(w) == 0:
                    log.debug('Discarding empty work')
                    continue

                log.debug('Work: %s', w)

                if k in self._active_work:
                    log.debug('Do Stash')
                    # The work_key is being worked on
                    if k not in self._stashed_work:
                        # Create the stash for this work_key
                        self._stashed_work[k] = (wm, [])

                    # Add the work to the stash
                    if is_list:
                        self._stashed_work[k][1].extend(w[:])
                    else:
                        self._stashed_work[k][1].append(w)
                else:
                    # If there is a stash for this work_key, prepend it to work
                    if k in self._stashed_work:
                        log.debug('Was a stash, prepend: %s, %s', self._stashed_work[k], w)
                        _, sv = self._stashed_work.pop(k)
                        if is_list:
                            sv.extend(w[:])
                        else:
                            sv.append(w)
                        w = sv
                        is_list = True # Work is a list going forward!!

                    log.debug('Work: %s', w)

                    # The work_key is not yet pending
                    not_in_pend = k not in self._pending_work

                    if not_in_pend:
                        # Create the pending for this work_key
                        log.debug('-> new pointer \'%s\'', k)
                        self._pending_work[k] = (wm, [])

                    # Add the work to the pending
                    log.debug('-> adding work to \'%s\': %s', k, w)
                    if is_list:
                        self._pending_work[k][1].extend(w[:])
                    else:
                        self._pending_work[k][1].append(w)

                    if not_in_pend:
                        # Add the not-yet-pending work to the work_queue
                        self.work_queue.put(k)
            except:
                raise


    def put_work(self, work_key, work_metrics, work):
        if self._shutdown:
            raise SystemError('This BrickDispatcher has been shutdown and cannot process more work!')
        self.prep_queue.put((work_key, work_metrics, work))

    def _add_failure(self, wp):
        pwp = pack(wp)
        log.warn('Adding to _failures: %s', pwp)
        if pwp in self._failures:
            self._failures[pwp] += 1
        else:
            self._failures[pwp] = 1

        if self._failures[pwp] > WORK_FAILURE_RETRIES:
            raise ValueError('Maximum failure retries exceeded')

    def receiver(self):
        while True:
            try:
                if self.resp_sock.closed:
                    break
                if self._do_stop and len(self._active_work) == 0:
                    break

                log.debug('Receive response message (loop)')
                msg = None
                while msg is None:
                    try:
                        msg = self.resp_sock.recv(zmq.NOBLOCK)
                    except zmq.ZMQError, e:
                        if e.errno == zmq.EAGAIN:
                            if self._do_stop:
                                break
                            else:
                                time.sleep(0.1)
                        else:
                            raise

                if msg is not None:
                    resp_type, worker_guid, work_key, work = unpack(msg)
                    work = list(work) if work is not None else work
                    if resp_type == SUCCESS:
                        log.debug('Worker %s was successful', worker_guid)
                        wguid, pw = self._active_work.pop(work_key)
                        if pw in self._failures:
                            self._failures.pop(pw)
                    elif resp_type == FAILURE:
                        log.debug('Failure reported for work on %s by worker %s', work_key, worker_guid)
                        if work_key is None:
                            # Worker failed before it did anything, put all work back on the prep queue to be reorganized by the organizer
                            # Because it failed so miserably, need to find the work_key based on guid
                            for k, v in self._active_work.iteritems():
                                if v[0] == worker_guid:
                                    work_key = k
                                    break

                            if work_key is not None:
                                wguid, pw = self._active_work.pop(work_key)
                                try:
                                    self._add_failure(pw)
                                except ValueError,e:
                                    self._failure_callback(e.message, unpack(pw))
                                    continue

                                self.put_work(*unpack(pw))
                        else:
                            # Normal failure
                            # Pop the work from active work, and queue the work returned by the worker
                            wguid, pw = self._active_work.pop(work_key)
                            try:
                                self._add_failure(pw)
                            except ValueError,e:
                                self._failure_callback(e.message, unpack(pw))
                                continue
                            _, wm, wk = unpack(pw)
                            self.put_work(work_key, wm, work)
Example #3
class BrickWriterDispatcher(object):
    def __init__(self,
                 failure_callback,
                 num_workers=1,
                 pidantic_dir=None,
                 working_dir=None):
        self.guid = create_guid()
        self.prep_queue = queue.Queue()
        self.work_queue = queue.Queue()
        self._pending_work = {}
        self._stashed_work = {}
        self._active_work = {}
        self._failures = {}
        self._do_stop = False
        self._count = -1
        self._shutdown = False
        self._failure_callback = failure_callback

        self.context = zmq.Context(1)
        self.prov_sock = self.context.socket(zmq.REP)
        self.prov_port = self._get_port(self.prov_sock)
        log.info('Provisioning url: tcp://*:{0}'.format(self.prov_port))

        self.resp_sock = self.context.socket(zmq.SUB)
        self.resp_port = self._get_port(self.resp_sock)
        self.resp_sock.setsockopt(zmq.SUBSCRIBE, '')
        log.info('Response url: tcp://*:{0}'.format(self.resp_port))

        self.num_workers = num_workers if num_workers > 0 else 1
        self.is_single_worker = self.num_workers == 1
        self.working_dir = working_dir or '.'
        self.pidantic_dir = pidantic_dir or './pid_dir'
        self.workers = []

        self._configure_workers()

    def _get_port(self, socket):
        # Bind to the first free port in PORT_RANGE; fail loudly if none is available
        for x in xrange(PORT_RANGE[0], PORT_RANGE[1]):
            try:
                socket.bind('tcp://*:{0}'.format(x))
                return x
            except ZMQError:
                continue
        raise SystemError('Could not bind to any port in PORT_RANGE')

    def _configure_workers(self):
        # TODO: if num_workers == 1, simply run one in-line (runs in a greenlet anyhow)
        if self.is_single_worker:
            from brick_worker import run_worker
            worker = run_worker(self.prov_port, self.resp_port)
            self.workers.append(worker)
        else:
            if os.path.exists(self.pidantic_dir):
                bdp = os.path.join(self.pidantic_dir, 'brick_dispatch')
                if os.path.exists(bdp):
                    import zipfile, zlib
                    with zipfile.ZipFile(
                            os.path.join(bdp, 'archived_worker_logs.zip'), 'a',
                            zipfile.ZIP_DEFLATED) as f:
                        names = f.namelist()
                        for x in [
                                x for x in os.listdir(bdp)
                                if x.startswith('worker_') and x not in names
                        ]:
                            fn = os.path.join(bdp, x)
                            f.write(filename=fn, arcname=x)
                            os.remove(fn)

            else:
                os.makedirs(self.pidantic_dir)

            self.factory = SupDPidanticFactory(name='brick_dispatch',
                                               directory=self.pidantic_dir)
            # Check for old workers - FOR NOW, TERMINATE THEM TODO: These should be reusable...
            old_workers = self.factory.reload_instances()
            for x in old_workers:
                old_workers[x].cleanup()

            worker_cmd = 'bin/python coverage_model/brick_worker.py {0} {1}'.format(
                self.prov_port, self.resp_port)
            for x in xrange(self.num_workers):
                w = self.factory.get_pidantic(
                    command=worker_cmd,
                    process_name='worker_{0}'.format(x),
                    directory=os.path.realpath(self.working_dir))
                w.start()
                self.workers.append(w)

            ready = False
            while not ready:
                self.factory.poll()
                for x in self.workers:
                    s = x.get_state()
                    if s is PIDanticState.STATE_STARTING:
                        # At least one worker is still starting - poll again
                        break
                    elif s is PIDanticState.STATE_RUNNING:
                        continue
                    elif s is PIDanticState.STATE_EXITED:
                        self.shutdown()
                        raise SystemError(
                            'Error starting worker - cannot continue')
                    else:
                        raise SystemError(
                            'Problem starting worker - cannot continue')
                else:
                    # Reached only when no worker broke out of the loop - all are running
                    ready = True

    def has_pending_work(self):
        return len(self._pending_work) > 0

    def has_active_work(self):
        return len(self._active_work) > 0

    def has_stashed_work(self):
        return len(self._stashed_work) > 0

    def is_dirty(self):
        if not self.has_active_work():
            if not self.has_stashed_work():
                if not self.has_pending_work():
                    return False

        return True

    def get_dirty_values_async_result(self):
        dirty_async_res = AsyncResult()

        def dirty_check(self, res):
            while True:
                if self.is_dirty():
                    time.sleep(0.1)
                else:
                    res.set(True)
                    break

        spawn(dirty_check, self, dirty_async_res)

        return dirty_async_res

    def run(self):
        self._do_stop = False
        self._org_g = spawn(self.organize_work)

        self._prov_g = spawn(self.provisioner)
        self._rec_g = spawn(self.receiver)

    def shutdown(self, force=False, timeout=None):
        if self._shutdown:
            return
        # CBM TODO: Revisit to ensure this won't strand work or terminate workers before they complete their work...!!
        self._do_stop = True
        try:
            log.debug('Force == %s', force)
            if not force:
                log.debug('Waiting for organizer; timeout == %s', timeout)
                # Wait for the organizer to finish - ensures the prep_queue is empty
                self._org_g.join(timeout=timeout)

                log.debug('Waiting for provisioner; timeout == %s', timeout)
                # Wait for the provisioner to finish - ensures work_queue is empty
                self._prov_g.join(timeout=timeout)

                log.debug('Waiting for receiver; timeout == %s', timeout)
                # Wait for the receiver to finish - allows workers to finish their work
                self._rec_g.join(timeout=timeout)

            log.debug('Killing organizer, provisioner, and receiver greenlets')
            # Terminate the greenlets
            self._org_g.kill()
            self._prov_g.kill()
            self._rec_g.kill()
            log.debug('Greenlets killed')

            log.debug('Shutdown workers')
            # Shutdown workers - work should be completed by now...
            if self.is_single_worker:
                # Current work will be finished
                self.workers[0].stop()
            else:
                self.workers = self.factory.reload_instances()
                # CBM TODO:  THIS DOES NOT ALLOW CURRENT WORK TO FINISH!!!
                for x in self.workers:
                    self.workers[x].cleanup()
                self.factory.terminate()
            log.debug('Workers shutdown')
        except:
            raise
        finally:
            log.debug('Closing provisioner and receiver sockets')
            # Close sockets
            self.prov_sock.close()
            self.resp_sock.close()
            log.debug('Sockets closed')
            log.debug('Terminating the context')
            self.context.term()
            log.debug('Context terminated')

            self._shutdown = True

    def organize_work(self):
        while True:
            if self._do_stop and self.prep_queue.empty():
                break
            try:
                # Timeout after 1 second to allow stopage and _stashed_work cleanup
                wd = self.prep_queue.get(timeout=1)
            except queue.Empty:
                # No new work added - see if there's anything on the stash to cleanup...
                for k in self._stashed_work:
                    log.debug('Cleanup _stashed_work...')
                    # Just want to trigger cleanup of the _stashed_work, pass an empty list of 'work', gets discarded
                    self.put_work(k, self._stashed_work[k][0], [])
                continue

            try:
                k, wm, w = wd
                is_list = isinstance(w, list)
                if k not in self._stashed_work and len(w) == 0:
                    log.debug('Discarding empty work')
                    continue

                log.debug('Work: %s', w)

                if k in self._active_work:
                    log.debug('Do Stash')
                    # The work_key is being worked on
                    if k not in self._stashed_work:
                        # Create the stash for this work_key
                        self._stashed_work[k] = (wm, [])

                    # Add the work to the stash
                    if is_list:
                        self._stashed_work[k][1].extend(w[:])
                    else:
                        self._stashed_work[k][1].append(w)
                else:
                    # If there is a stash for this work_key, prepend it to work
                    if k in self._stashed_work:
                        log.debug('Was a stash, prepend: %s, %s',
                                  self._stashed_work[k], w)
                        _, sv = self._stashed_work.pop(k)
                        if is_list:
                            sv.extend(w[:])
                        else:
                            sv.append(w)
                        w = sv
                        is_list = True  # Work is a list going forward!!

                    log.debug('Work: %s', w)

                    # The work_key is not yet pending
                    not_in_pend = k not in self._pending_work

                    if not_in_pend:
                        # Create the pending for this work_key
                        log.debug('-> new pointer \'%s\'', k)
                        self._pending_work[k] = (wm, [])

                    # Add the work to the pending
                    log.debug('-> adding work to \'%s\': %s', k, w)
                    if is_list:
                        self._pending_work[k][1].extend(w[:])
                    else:
                        self._pending_work[k][1].append(w)

                    if not_in_pend:
                        # Add the not-yet-pending work to the work_queue
                        self.work_queue.put(k)
            except:
                raise

    def put_work(self, work_key, work_metrics, work):
        if self._shutdown:
            raise SystemError(
                'This BrickDispatcher has been shutdown and cannot process more work!'
            )
        self.prep_queue.put((work_key, work_metrics, work))

    def _add_failure(self, wp):
        pwp = pack(wp)
        log.warn('Adding to _failures: %s', pwp)
        if pwp in self._failures:
            self._failures[pwp] += 1
        else:
            self._failures[pwp] = 1

        if self._failures[pwp] > WORK_FAILURE_RETRIES:
            raise ValueError('Maximum failure retries exceeded')

    def receiver(self):
        while True:
            try:
                if self.resp_sock.closed:
                    break
                if self._do_stop and len(self._active_work) == 0:
                    break

                log.debug('Receive response message (loop)')
                msg = None
                while msg is None:
                    try:
                        msg = self.resp_sock.recv(zmq.NOBLOCK)
                    except zmq.ZMQError, e:
                        if e.errno == zmq.EAGAIN:
                            if self._do_stop:
                                break
                            else:
                                time.sleep(0.1)
                        else:
                            raise

                if msg is not None:
                    resp_type, worker_guid, work_key, work = unpack(msg)
                    work = list(work) if work is not None else work
                    if resp_type == SUCCESS:
                        log.debug('Worker %s was successful', worker_guid)
                        wguid, pw = self._active_work.pop(work_key)
                        if pw in self._failures:
                            self._failures.pop(pw)
                    elif resp_type == FAILURE:
                        log.debug(
                            'Failure reported for work on %s by worker %s',
                            work_key, worker_guid)
                        if work_key is None:
                            # Worker failed before it did anything, put all work back on the prep queue to be reorganized by the organizer
                            # Because it failed so miserably, need to find the work_key based on guid
                            for k, v in self._active_work.iteritems():
                                if v[0] == worker_guid:
                                    work_key = k
                                    break

                            if work_key is not None:
                                wguid, pw = self._active_work.pop(work_key)
                                try:
                                    self._add_failure(pw)
                                except ValueError, e:
                                    self._failure_callback(
                                        e.message, unpack(pw))
                                    continue

                                self.put_work(*unpack(pw))
                        else:
                            # Normal failure
                            # Pop the work from active work, and queue the work returned by the worker
                            wguid, pw = self._active_work.pop(work_key)
                            try:
                                self._add_failure(pw)
                            except ValueError, e:
                                self._failure_callback(e.message, unpack(pw))
                                continue
                            _, wm, wk = unpack(pw)
                            self.put_work(work_key, wm, work)
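A lifecycle sketch for the dispatcher above. The failure callback takes the two arguments passed from receiver(); the work key, metrics dict, and payload are placeholders whose real shape depends on what the brick workers consume.

# Hedged lifecycle sketch: only the call sequence mirrors the class above; the
# work key, metrics, and payload are placeholders.
def on_failure(message, failed_work):
    log.error('work permanently failed: %s (%s)', message, failed_work)

dispatcher = BrickWriterDispatcher(on_failure, num_workers=2,
                                   pidantic_dir='./pid_dir', working_dir='.')
dispatcher.run()                                   # spawns organizer/provisioner/receiver greenlets
dispatcher.put_work('brick-0', {'origin': 'cov-0'}, [(0, 100)])
dispatcher.get_dirty_values_async_result().get()   # blocks until no pending/active/stashed work remains
dispatcher.shutdown(timeout=30)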
Example #4
class SupDExe(object):
    def __init__(self, eeagent_cfg, log=logging):
        self.log = log
        self.log.debug("Starting SupDExe")
        self._working_dir = eeagent_cfg.launch_type.supd_directory
        self._eename = eeagent_cfg.name
        supdexe = _set_param_or_default(eeagent_cfg.launch_type, "supdexe", None)
        self._slots = int(eeagent_cfg.slots)
        self._factory = SupDPidanticFactory(directory=self._working_dir, name=self._eename, supdexe=supdexe)
        pidantic_instances = self._factory.reload_instances()
        self._known_pws = {}
        for name in pidantic_instances:
            pidantic = pidantic_instances[name]
            pw = PidWrapper(self, name)
            pw.set_pidantic(pidantic)
            self._known_pws[name] = pw
        self._state_change_cb = None
        self._state_change_cb_arg = None

    def set_state_change_callback(self, cb, user_arg):
        self._state_change_cb = cb
        self._state_change_cb_arg = user_arg

        for name in self._known_pws:
            pw = self._known_pws[name]
            pw.set_state_change_callback(self._state_change_cb, self._state_change_cb_arg)

    def run(self, name, parameters):
        pw = PidWrapper(self, name)
        self._known_pws[name] = pw
        command = parameters["exec"] + " " + " ".join(parameters["argv"])

        dir = self._working_dir
        if "working_directory" in parameters:
            dir = parameters["working_directory"]
        pid = self._factory.get_pidantic(command=command, process_name=name, directory=dir)
        pw.set_pidantic(pid)
        if self._state_change_cb:
            pw.set_state_change_callback(self._state_change_cb, self._state_change_cb_arg)

        running_jobs = self._get_running()
        x = len(running_jobs)
        if x <= self._slots:
            pid.start()
        else:
            pid.cancel_request()
        return pw

    def get_known_pws(self):
        return self._known_pws

    def _remove_proc(self, proc_name):
        del self._known_pws[proc_name]

    def lookup_id(self, process_name, ignore_round=False):

        if ignore_round:
            process_upid, process_round = unmake_id(process_name)
            for name, proc in self._known_pws.iteritems():
                upid, round = unmake_id(name)

                if process_upid == upid:
                    return proc
            else:
                return None

        else:
            if process_name not in self._known_pws:
                return None
            return self._known_pws[process_name]

    def get_all(self):
        return self._known_pws

    def _get_running(self):
        running_states = [PidWrapper.RUNNING, PidWrapper.TERMINATING, PidWrapper.PENDING]
        procs = self.get_all().values()
        # A process counts toward the slot limit while it is in any of these states
        running = [i for i in procs if i.get_state() in running_states]
        return running

    def poll(self):
        return self._factory.poll()

    def terminate(self):
        self._factory.terminate()
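A short sketch of wiring up the state-change callback added in this variant. The arguments PidWrapper passes to the callback are not shown above, so the two-argument signature below is an assumption; user_arg is treated as an opaque value.

# Hedged callback sketch: the callback signature is assumed, since the
# PidWrapper side of the call is not shown in the example above.
import logging

def on_state_change(pid_wrapper, user_arg):
    logging.debug('state change observed for %s (%s)', pid_wrapper, user_arg)

exe = SupDExe(eeagent_cfg, log=logging)
exe.set_state_change_callback(on_state_change, 'eeagent-1')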
Example #5
class EPUHarness(object):
    """EPUHarness. Sets up Process Dispatchers and EEAgents for testing.
    """
    def __init__(self,
                 exchange=None,
                 pidantic_dir=None,
                 amqp_uri=None,
                 config=None,
                 sysname=None):

        configs = ["epuharness"]
        config_files = get_config_paths(configs)
        if config:
            config_files.append(config)
        self.CFG = bootstrap.configure(config_files)
        self.sysname = sysname

        self.logdir = self.CFG.epuharness.logdir
        self.pidantic_dir = (pidantic_dir
                             or os.environ.get('EPUHARNESS_PERSISTENCE_DIR')
                             or self.CFG.epuharness.pidantic_dir)
        self.exchange = exchange or self.CFG.server.amqp.get(
            'exchange', None) or str(uuid.uuid4())
        self.CFG.server.amqp.exchange = self.exchange
        self.CFG.dashi.sysname = sysname
        self.dashi = bootstrap.dashi_connect(self.CFG.dashi.topic,
                                             self.CFG,
                                             amqp_uri=amqp_uri,
                                             sysname=sysname)
        self.amqp_cfg = dict(self.CFG.server.amqp)

        self.factory = None
        self.savelogs_dir = None

    def _setup_factory(self):

        if self.factory:
            return

        try:
            self.factory = SupDPidanticFactory(directory=self.pidantic_dir,
                                               name="epu-harness")
        except Exception:
            log.debug("Problem Connecting to SupervisorD", exc_info=True)
            raise HarnessException(
                "Could not connect to supervisord. Was epu-harness started?")

    def status(self, exit=True):
        """Get status of services that were previously started by epuharness
        """

        self._setup_factory()

        instances = self.factory.reload_instances()
        self.factory.poll()
        return_code = 0
        status = []
        for name, instance in instances.iteritems():
            state = instance.get_state()
            status.append((name, state))
            if state != PIDanticState.STATE_RUNNING:
                return_code = 1

            log.info("%s is %s" % (name, instance.get_state()))
        if exit:
            sys.exit(return_code)
        else:
            return status

    def get_logfiles(self):
        """Returns a list of logfile paths relevant to epuharness instance
        """
        # pretty hacky. we could get these over the supd API instead.
        # but that's certainly slower and not really better.
        pidantic_dir = os.path.abspath(self.pidantic_dir)
        epuharness_dir = os.path.join(pidantic_dir, "epu-harness")

        logfiles = []
        for f in os.listdir(epuharness_dir):
            if os.path.splitext(f)[1].lower() == ".log":
                logfiles.append(os.path.join(epuharness_dir, f))
        return logfiles

    def stop(self, services=None, force=False, remove_dir=True):
        """Stop services that were previously started by epuharness

        @param force: When False raises an exception when there is something
                      that can't be killed.
        """
        cleanup = False

        self._setup_factory()
        instances = self.factory.reload_instances()

        # If we're killing everything, perform cleanup
        if services == instances.keys():
            cleanup = True
        elif not services:
            cleanup = True
            services = instances.keys()

        log.info("Stopping %s" % ", ".join(services))
        for service in services:
            instances_to_kill = filter(lambda x: x.startswith(service),
                                       instances.keys())
            for instance_name in instances_to_kill:
                instance = instances[instance_name]
                self._clean_instance_config(instance)
                if not cleanup:
                    instance.cleanup()

        if cleanup:
            if self.savelogs_dir:
                try:
                    self._save_logs(self.savelogs_dir)
                except Exception:
                    log.exception("Problem saving logs. Proceeding.")

            try:
                self.factory.terminate()
            except Exception as e:
                log.warning("Problem terminating factory, continuing : %s" % e)

            if remove_dir:
                careful_rmtree(self.pidantic_dir)

        self.dashi.cancel()
        self.dashi.disconnect()

    def _save_logs(self, output_dir):
        for logfile in self.get_logfiles():
            basename = os.path.basename(logfile)
            dest_path = os.path.join(output_dir, basename)
            try:
                shutil.copy2(logfile, dest_path)
            except Exception:
                log.exception("Error copying logfile %s", logfile)

    def _clean_instance_config(self, instance):
        try:
            # Clean up config files
            command = instance._program_object.command
            command = command.split()
            for config in command:
                if config.endswith('.yml'):
                    _cf = yaml.load(config)
                    with open(_cf) as cf:
                        cfg = yaml.load(cf)
                        try:
                            persistence = cfg['apps'][0]['config']['eeagent'][
                                'launch_type']['persistence_directory']
                            careful_rmtree(persistence)
                        except Exception:
                            pass
                    os.remove(config)
        except Exception, e:
            # Perhaps instance internals have changed
            log.warning("Couldn't delete temporary config files: %s" % e)
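A short harness sketch for the class above, assuming epu-harness/supervisord is already running and an 'epuharness' config file is resolvable via get_config_paths; the exchange name, directory, and sysname are placeholders.

# Hedged harness sketch: the exchange name, pidantic directory, and sysname are
# placeholders; supervisord is assumed to be running already.
harness = EPUHarness(exchange='test-x', pidantic_dir='/tmp/epu-pids', sysname='test')
statuses = harness.status(exit=False)      # returns [(name, state), ...] instead of calling sys.exit()
for name, state in statuses:
    log.info('%s -> %s', name, state)
harness.stop(remove_dir=True)              # terminates services and removes pidantic_dir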
Example #6
class EPUHarness(object):
    """EPUHarness. Sets up Process Dispatchers and EEAgents for testing.
    """

    def __init__(self, exchange=None, pidantic_dir=None, amqp_uri=None, config=None, sysname=None):

        configs = ["epuharness"]
        config_files = get_config_paths(configs)
        if config:
            config_files.append(config)
        self.CFG = bootstrap.configure(config_files)
        self.sysname = sysname

        self.logdir = self.CFG.epuharness.logdir
        self.pidantic_dir = (pidantic_dir or
                os.environ.get('EPUHARNESS_PERSISTENCE_DIR') or
                self.CFG.epuharness.pidantic_dir)
        self.exchange = exchange or self.CFG.server.amqp.get('exchange', None) or str(uuid.uuid4())
        self.CFG.server.amqp.exchange = self.exchange
        self.CFG.dashi.sysname = sysname
        self.dashi = bootstrap.dashi_connect(self.CFG.dashi.topic, self.CFG, amqp_uri=amqp_uri, sysname=sysname)
        self.amqp_cfg = dict(self.CFG.server.amqp)

        self.factory = None
        self.savelogs_dir = None

    def _setup_factory(self):

        if self.factory:
            return

        try:
            self.factory = SupDPidanticFactory(directory=self.pidantic_dir,
                    name="epu-harness")
        except Exception:
            log.debug("Problem Connecting to SupervisorD", exc_info=True)
            raise HarnessException("Could not connect to supervisord. Was epu-harness started?")

    def status(self, exit=True):
        """Get status of services that were previously started by epuharness
        """

        self._setup_factory()

        instances = self.factory.reload_instances()
        self.factory.poll()
        return_code = 0
        status = []
        for name, instance in instances.iteritems():
            state = instance.get_state()
            status.append((name, state))
            if state != PIDanticState.STATE_RUNNING:
                return_code = 1

            log.info("%s is %s" % (name, instance.get_state()))
        if exit:
            sys.exit(return_code)
        else:
            return status

    def get_logfiles(self):
        """Returns a list of logfile paths relevant to epuharness instance
        """
        # pretty hacky. we could get these over the supd API instead.
        # but that's certainly slower and not really better.
        pidantic_dir = os.path.abspath(self.pidantic_dir)
        epuharness_dir = os.path.join(pidantic_dir, "epu-harness")

        logfiles = []
        for f in os.listdir(epuharness_dir):
            if os.path.splitext(f)[1].lower() == ".log":
                logfiles.append(os.path.join(epuharness_dir, f))
        return logfiles

    def stop(self, services=None, force=False, remove_dir=True):
        """Stop services that were previously started by epuharness

        @param force: When False raises an exception when there is something
                      that can't be killed.
        """
        cleanup = False

        self._setup_factory()
        instances = self.factory.reload_instances()

        # If we're killing everything, perform cleanup
        if services == instances.keys():
            cleanup = True
        elif not services:
            cleanup = True
            services = instances.keys()

        log.info("Stopping %s" % ", ".join(services))
        for service in services:
            instances_to_kill = filter(lambda x: x.startswith(service), instances.keys())
            for instance_name in instances_to_kill:
                instance = instances[instance_name]
                self._clean_instance_config(instance)
                if not cleanup:
                    instance.cleanup()

        if cleanup:
            if self.savelogs_dir:
                try:
                    self._save_logs(self.savelogs_dir)
                except Exception:
                    log.exception("Problem saving logs. Proceeding.")

            try:
                self.factory.terminate()
            except Exception as e:
                log.warning("Problem terminating factory, continuing : %s" % e)

            if remove_dir:
                careful_rmtree(self.pidantic_dir)

        self.dashi.cancel()
        self.dashi.disconnect()

    def _save_logs(self, output_dir):
        for logfile in self.get_logfiles():
            basename = os.path.basename(logfile)
            dest_path = os.path.join(output_dir, basename)
            try:
                shutil.copy2(logfile, dest_path)
            except Exception:
                log.exception("Error copying logfile %s", logfile)

    def _clean_instance_config(self, instance):
        try:
            # Clean up config files
            command = instance._program_object.command
            command = command.split()
            for config in command:
                if config.endswith('.yml'):
                    _cf = yaml.load(config)
                    with open(_cf) as cf:
                        cfg = yaml.load(cf)
                        try:
                            persistence = cfg['apps'][0]['config']['eeagent']['launch_type']['persistence_directory']
                            careful_rmtree(persistence)
                        except Exception:
                            pass
                    os.remove(config)
        except Exception, e:
            # Perhaps instance internals have changed
            log.warning("Couldn't delete temporary config files: %s" % e)
Example #7
class SupDExe(object):
    def __init__(self, eeagent_cfg, log=logging):
        self.log = log
        self.log.debug("Starting SupDExe")
        self._working_dir = eeagent_cfg.launch_type.supd_directory
        self._eename = eeagent_cfg.name
        supdexe = _set_param_or_default(eeagent_cfg.launch_type, 'supdexe',
                                        None)
        self._slots = int(eeagent_cfg.slots)
        self._factory = SupDPidanticFactory(directory=self._working_dir,
                                            name=self._eename,
                                            supdexe=supdexe)
        pidantic_instances = self._factory.reload_instances()
        self._known_pws = {}
        for name in pidantic_instances:
            pidantic = pidantic_instances[name]
            pw = PidWrapper(self, name)
            pw.set_pidantic(pidantic)
            self._known_pws[name] = pw
        self._state_change_cb = None
        self._state_change_cb_arg = None

    def set_state_change_callback(self, cb, user_arg):
        self._state_change_cb = cb
        self._state_change_cb_arg = user_arg

        for name in self._known_pws:
            pw = self._known_pws[name]
            pw.set_state_change_callback(self._state_change_cb,
                                         self._state_change_cb_arg)

    def run(self, name, parameters):
        pw = PidWrapper(self, name)
        self._known_pws[name] = pw
        command = parameters['exec'] + " " + " ".join(parameters['argv'])

        dir = self._working_dir
        if "working_directory" in parameters:
            dir = parameters["working_directory"]
        pid = self._factory.get_pidantic(command=command,
                                         process_name=name,
                                         directory=dir)
        pw.set_pidantic(pid)
        if self._state_change_cb:
            pw.set_state_change_callback(self._state_change_cb,
                                         self._state_change_cb_arg)

        running_jobs = self._get_running()
        x = len(running_jobs)
        if x <= self._slots:
            pid.start()
        else:
            pid.cancel_request()
        return pw

    def get_known_pws(self):
        return self._known_pws

    def _remove_proc(self, proc_name):
        del self._known_pws[proc_name]

    def lookup_id(self, process_name, ignore_round=False):

        if ignore_round:
            process_upid, process_round = unmake_id(process_name)
            for name, proc in self._known_pws.iteritems():
                upid, round = unmake_id(name)

                if process_upid == upid:
                    return proc
            else:
                return None

        else:
            if process_name not in self._known_pws:
                return None
            return self._known_pws[process_name]

    def get_all(self):
        return self._known_pws

    def _get_running(self):
        running_states = [
            PidWrapper.RUNNING, PidWrapper.TERMINATING, PidWrapper.PENDING
        ]
        procs = self.get_all().values()
        # A process counts toward the slot limit while it is in any of these states
        running = [i for i in procs if i.get_state() in running_states]
        return running

    def poll(self):
        return self._factory.poll()

    def terminate(self):
        self._factory.terminate()