Example #1
def getNodes(api, num, **constraints):
    # Now do the backtracking search for a suitable solution
    # First with existing slice nodes
    reqs = []
    nodes = []

    import operator  # for itemgetter in pickbest below

    import node as Node  # the PlanetLab node wrapper module

    for i in xrange(num):
        node = Node.Node(api)
        node.min_num_external_interface = 1
        nodes.append(node)

    # All nodes share the same constraints, so one candidate query,
    # filtered against the blacklist, serves every request
    node = nodes[0]
    candidates = filterBlacklist(node.find_candidates())
    reqs = [candidates] * num

    def pickbest(fullset, nreq, node=nodes[0]):
        # Keep only the nreq best-rated candidates, as scored by rate_nodes()
        if len(fullset) > nreq:
            fullset = zip(node.rate_nodes(fullset), fullset)
            fullset.sort(reverse=True)
            del fullset[nreq:]
            return set(map(operator.itemgetter(1), fullset))
        else:
            return fullset

    solution = resourcealloc.alloc(reqs, sample=pickbest)

    # Assign the chosen node ids to the nodes, in parallel
    runner = ParallelRun(maxthreads=4)
    for node, node_id in zip(nodes, solution):
        runner.put(node.assign_node_id, node_id)
    runner.join()

    return nodes
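
The pickbest helper above is a generic top-N selection idiom: score each candidate, sort best-first, keep the nreq best. The following self-contained sketch shows the same idiom with a plain scoring function; every name in it is illustrative, not taken from NEPI.

import operator

def pick_top_n(items, n, score):
    # Pair each item with its score, sort best-first, keep the n best
    if len(items) > n:
        ranked = zip(map(score, items), items)
        ranked.sort(reverse=True)
        del ranked[n:]
        return set(map(operator.itemgetter(1), ranked))
    return set(items)

# e.g. keep the 2 longest hostnames
print pick_top_n(["a.example.net", "bb.example.net", "c.example.net"], 2, len)
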
Example #2
    def do_wait_nodes(self):
        for guid, node in self._elements.iteritems():
            if isinstance(node, self._node.Node):
                # Inject the slice configuration into the node
                node.home_path = "nepi-node-%s" % (guid,)
                node.ident_path = self.sliceSSHKey
                node.slicename = self.slicename

                self._logger.info("PlanetLab Node %s configured at %s", guid, node.hostname)

        try:
            runner = ParallelRun(maxthreads=64, maxqueue=1)
            abort = []

            def waitforit(guid, node):
                try:
                    # Freshly provisioned nodes get a much longer grace period
                    node.wait_provisioning(
                        (20*60 if node._node_id in self._just_provisioned else 60)
                    )

                    self._logger.info("READY Node %s at %s", guid, node.hostname)

                    # Prepare the dependency installer now
                    node.prepare_dependencies()
                except:
                    # Flag the failure so the loop below stops queuing waits
                    abort.append(None)
                    raise

            for guid, node in self._elements.iteritems():
                if abort:
                    break
                if isinstance(node, self._node.Node):
                    self._logger.info("Waiting for Node %s configured at %s", guid, node.hostname)
                    runner.put(waitforit, guid, node)
            runner.join()

        except self._node.UnresponsiveNodeError:
            self._logger.warn("UNRESPONSIVE Nodes")

            # Blacklist all unresponsive (dead) nodes and re-raise
            for guid, node in self._elements.iteritems():
                if isinstance(node, self._node.Node):
                    if not node.is_alive():
                        self._logger.warn("Blacklisting %s for unresponsiveness", node.hostname)
                        self._blacklist.add(node.hostname)
                        node.unassign_node()

            try:
                self._save_blacklist()
            except:
                # Failing to persist the blacklist is not fatal
                import traceback
                traceback.print_exc()

            raise
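
One detail worth noting in do_wait_nodes is the abort list: workers append to it on failure, and the producer loop checks it before queuing further waits, so a dead slice does not keep queuing doomed waits. The check is best-effort, since already-queued waits still run. A minimal stand-alone rendering of the pattern, with all names illustrative:

import threading
import time

abort = []

def waitforit(i):
    # Simulate one node that never comes up
    if i == 3:
        abort.append(None)  # any entry tells the producer to stop queuing
        return
    time.sleep(0.01)

threads = []
for i in range(10):
    if abort:
        break  # a wait already failed; don't queue more
    t = threading.Thread(target=waitforit, args=(i,))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

print "aborted early:", bool(abort)
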
Example #3
    def test_run_simple(self):
        runner = ParallelRun(maxthreads=4)
        runner.start()

        count = [0]

        def inc(count):
            count[0] += 1

        for x in xrange(10):
            runner.put(inc, count)

        runner.destroy()

        self.assertEquals(count[0], 10)
Example #4
    def test_run_interrupt(self):
        def sleep():
            import time
            time.sleep(5)

        startt = datetime.datetime.now()

        runner = ParallelRun(maxthreads=4)
        runner.start()

        for x in xrange(100):
            runner.put(sleep)

        # empty() discards any tasks still queued, so the run should
        # finish long before all 100 five-second sleeps could execute
        runner.empty()
        runner.destroy()

        endt = datetime.datetime.now()
        time_elapsed = (endt - startt).seconds
        self.assertTrue(time_elapsed < 500)
Example #5
    def test_run_error(self):
        count = [0]

        def inc(count):
            count[0] += 1

        def error():
            raise RuntimeError()

        runner = ParallelRun(maxthreads=4)
        runner.start()

        for x in xrange(4):
            runner.put(inc, count)

        runner.put(error)

        runner.destroy()

        self.assertEquals(count[0], 4)

        # The RuntimeError trapped in the worker is re-raised by sync()
        self.assertRaises(RuntimeError, runner.sync)
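
Read together, the three tests pin down the ParallelRun contract they rely on: start() spins up the workers, put() queues a callable with its arguments, empty() discards tasks that have not started, destroy() drains outstanding work, and sync() re-raises an exception trapped in a worker. The class below is a minimal reconstruction of that contract from the tests alone; it is a sketch, not NEPI's actual implementation.

import sys
import threading
import Queue

class MiniParallelRun(object):
    """Toy stand-in reconstructed from the tests above."""

    def __init__(self, maxthreads=4):
        self.queue = Queue.Queue()
        self.maxthreads = maxthreads
        self.workers = []
        self.exceptions = []

    def start(self):
        for _ in xrange(self.maxthreads):
            t = threading.Thread(target=self._work)
            t.daemon = True
            t.start()
            self.workers.append(t)

    def _work(self):
        while True:
            item = self.queue.get()
            if item is None:  # sentinel: shut this worker down
                self.queue.task_done()
                return
            callable_, args, kwargs = item
            try:
                callable_(*args, **kwargs)
            except:
                self.exceptions.append(sys.exc_info())
            self.queue.task_done()

    def put(self, callable_, *args, **kwargs):
        self.queue.put((callable_, args, kwargs))

    def empty(self):
        # Discard tasks that no worker has picked up yet
        try:
            while True:
                self.queue.get_nowait()
                self.queue.task_done()
        except Queue.Empty:
            pass

    def destroy(self):
        # One sentinel per worker, then wait for the queue to drain
        for _ in self.workers:
            self.queue.put(None)
        self.queue.join()

    def sync(self):
        # Re-raise the first exception trapped in a worker, if any
        if self.exceptions:
            typ, val, tb = self.exceptions.pop(0)
            raise typ, val, tb
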
Example #6
    def _process(self):
        """ Process scheduled tasks.

        .. note::
        
        Tasks are scheduled by invoking the schedule method with a target 
        callback and an execution time. 
        The schedule method creates a new Task object with that callback 
        and execution time, and pushes it into the '_scheduler' queue. 
        The execution time and the order of arrival of tasks are used 
        to order the tasks in the queue.

        The _process method is executed in an independent thread held by 
        the ExperimentController for as long as the experiment is running.
        This method takes tasks from the '_scheduler' queue in a loop 
        and processes them in parallel using multithreading. 
        The environmental variable NEPI_NTHREADS can be used to control
        the number of threads used to process tasks. The default value is 
        50.

        To execute tasks in parallel, a ParallelRunner (PR) object is used.
        This object keeps a pool of threads (workers), and a queue of tasks
        scheduled for 'immediate' execution. 
        
        On each iteration, the '_process' loop will take the next task that 
        is scheduled for 'future' execution from the '_scheduler' queue, 
        and if the execution time of that task is >= to the current time, 
        it will push that task into the PR for 'immediate execution'. 
        As soon as a worker is free, the PR will assign the next task to
        that worker.

        Upon receiving a task to execute, each PR worker (thread) will 
        invoke the  _execute method of the EC, passing the task as 
        argument.         
        The _execute method will then invoke task.callback inside a 
        try/except block. If an exception is raised by the tasks.callback, 
        it will be trapped by the try block, logged to standard error 
        (usually the console), and the task will be marked as failed.

        """

        self._nthreads = int(
            os.environ.get("NEPI_NTHREADS", str(self._nthreads)))
        self._runner = ParallelRun(maxthreads=self.nthreads)
        self._runner.start()

        while not self._stop:
            try:
                self._cond.acquire()

                task = self._scheduler.next()

                if not task:
                    # No task to execute. Wait for a new task to be scheduled.
                    self._cond.wait()
                else:
                    # The task timestamp is in the future. Wait for timeout
                    # or until another task is scheduled.
                    now = tnow()
                    if now < task.timestamp:
                        # Calculate timeout in seconds
                        timeout = tdiffsec(task.timestamp, now)

                        # Re-schedule task with the same timestamp
                        self._scheduler.schedule(task)

                        task = None

                        # Wait timeout or until a new task awakes the condition
                        self._cond.wait(timeout)

                self._cond.release()

                if task:
                    # Process tasks in parallel
                    self._runner.put(self._execute, task)
            except:
                import traceback
                err = traceback.format_exc()
                self.logger.error(
                    "Error while processing tasks in the EC: %s" % err)

                # Set the EC to FAILED state
                self._state = ECState.FAILED

                # Set the FailureManager failure level to EC failure
                self._fm.set_ec_failure()

        self.logger.debug("Exiting the task processing loop ... ")

        self._runner.sync()
        self._runner.destroy()
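
Stripped of EC bookkeeping, the loop above is the classic "wait until the earliest task is due, but wake early when a new task arrives" condition-variable pattern. The sketch below shows just that pattern, with a plain heap standing in for the '_scheduler' queue; all names are illustrative.

import heapq
import itertools
import threading
import time

cond = threading.Condition()
heap = []                    # (timestamp, arrival, callback), earliest first
arrival = itertools.count()  # arrival order breaks timestamp ties

def schedule(delay, callback):
    with cond:
        heapq.heappush(heap, (time.time() + delay, next(arrival), callback))
        cond.notify()  # wake the loop: the new task may be the earliest

def process(stop):
    while not stop.is_set():
        with cond:
            if not heap:
                cond.wait(1.0)  # nothing scheduled: wait for a notify
                continue
            timestamp, _, callback = heap[0]
            now = time.time()
            if now < timestamp:
                # Head task is in the future: sleep until it is due,
                # or until a newly scheduled task awakes the condition
                cond.wait(timestamp - now)
                continue
            heapq.heappop(heap)
        callback()  # due now (a real EC would hand it to ParallelRun)

stop = threading.Event()
t = threading.Thread(target=process, args=(stop,))
t.start()

def task():
    print "task ran"

schedule(0.2, task)
time.sleep(0.5)
stop.set()
with cond:
    cond.notify()  # wake the loop so it notices the stop flag
t.join()
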
Example #7
    def _do_in_factory_order(self,
                             action,
                             order,
                             postaction=None,
                             poststep=None):
        logger = self._logger

        guids = collections.defaultdict(list)
        # order guids (elements) according to factory_id
        for guid, factory_id in self._create.iteritems():
            guids[factory_id].append(guid)

        # configure elements following the factory_id order
        for factory_id in order:
            # Create a parallel runner if we're given a Parallel() wrapper
            runner = None
            if isinstance(factory_id, Parallel):
                runner = ParallelRun(factory_id.maxthreads)
                factory_id = factory_id.factory

            # omit the factories that have no element to create
            if factory_id not in guids:
                continue

            # configure action
            factory = self._factories[factory_id]
            if isinstance(action, basestring) and not getattr(factory, action):
                continue

            def perform_action(guid):
                if isinstance(action, basestring):
                    getattr(factory, action)(self, guid)
                else:
                    action(self, guid)
                if postaction:
                    postaction(self, guid)

            # perform the action on all elements, in parallel if so requested
            if runner:
                logger.debug("TestbedController: Starting parallel %s", action)
                runner.start()

            for guid in guids[factory_id]:
                if runner:
                    logger.debug("TestbedController: Scheduling %s on %s",
                                 action, guid)
                    runner.put(perform_action, guid)
                else:
                    logger.debug("TestbedController: Performing %s on %s",
                                 action, guid)
                    perform_action(guid)

            # sync
            if runner:
                runner.sync()

            # post hook
            if poststep:
                for guid in guids[factory_id]:
                    if runner:
                        logger.debug(
                            "TestbedController: Scheduling post-%s on %s",
                            action, guid)
                        runner.put(poststep, self, guid)
                    else:
                        logger.debug(
                            "TestbedController: Performing post-%s on %s",
                            action, guid)
                        poststep(self, guid)

            # sync
            if runner:
                runner.join()
                logger.debug("TestbedController: Finished parallel %s", action)
Example #8
    def do_resource_discovery(self, recover = False):
        to_provision = self._to_provision = set()
        
        reserved = set(self._blacklist)
        for guid, node in self._elements.iteritems():
            if isinstance(node, self._node.Node) and node._node_id is not None:
                reserved.add(node.hostname)
        
        # Initial algo:
        #   look for perfectly defined nodes
        #   (ie: those with only one candidate)
        reserve_lock = threading.RLock()
        def assignifunique(guid, node):
            # Try existing nodes first
            # If we have only one candidate, simply use it
            candidates = node.find_candidates(
                filter_slice_id = self.slice_id)
            
            node_id = None
            candidate_hosts = set(candidates.keys() if candidates else [])
            reserve_lock.acquire()
            try:
                candidate_hosts -= reserved
                if len(candidate_hosts) == 1:
                    hostname = iter(candidate_hosts).next()
                    node_id = candidates[hostname]
                    reserved.add(hostname)
                elif not candidate_hosts:
                    # Try again including unassigned nodes
                    reserve_lock.release()
                    try:
                        candidates = node.find_candidates()
                    finally:
                        reserve_lock.acquire()
                    candidate_hosts = set(candidates.keys() if candidates else [])
                    candidate_hosts -= reserved
                    if len(candidate_hosts) > 1:
                        return
                    if len(candidate_hosts) == 1:
                        hostname = iter(candidate_hosts).next()
                        node_id = candidates[hostname]
                        to_provision.add(node_id)
                        reserved.add(hostname)
                    elif not candidates:
                        raise RuntimeError, "Cannot assign resources for node %s, no candidates with %s" % (guid,
                            node.make_filter_description())
            finally:
                reserve_lock.release()
           
            if node_id is not None:
                node.assign_node_id(node_id)
        
        runner = ParallelRun(maxthreads=4) # just 4 threads: enough to hide PLC API latencies without overloading it
        runner.start()
        for guid, node in self._elements.iteritems():
            if isinstance(node, self._node.Node) and node._node_id is None:
                runner.put(assignifunique, guid, node)
        runner.sync()
        
        # Now do the backtracking search for a suitable solution
        # First with existing slice nodes
        reqs = []
        nodes = []
        def genreqs(node, filter_slice_id=None):
            # Collect this node's surviving candidates (the first pass
            # restricts them to existing slice nodes via filter_slice_id)
            candidates = node.find_candidates(
                filter_slice_id = filter_slice_id)
            for r in reserved:
                if candidates.has_key(r):
                    del candidates[r]
            reqs.append(candidates.values())
            nodes.append(node)
        for guid, node in self._elements.iteritems():
            if isinstance(node, self._node.Node) and node._node_id is None:
                runner.put(genreqs, node, self.slice_id)
        runner.sync()
       
        if nodes and reqs:
            if recover:
                raise RuntimeError, "Impossible to recover: unassigned host for Nodes %r" % (nodes,)

            def pickbest(fullset, nreq, node=nodes[0]):
                # Keep only the nreq best-rated candidates, as scored by rate_nodes()
                if len(fullset) > nreq:
                    fullset = zip(node.rate_nodes(fullset), fullset)
                    fullset.sort(reverse=True)
                    del fullset[nreq:]
                    return set(map(operator.itemgetter(1), fullset))
                else:
                    return fullset
            
            try:
                solution = resourcealloc.alloc(reqs, sample=pickbest)
            except resourcealloc.ResourceAllocationError:
                # Failed, try again with all nodes
                reqs = []
                for node in nodes:
                    runner.put(genreqs, node)
                runner.sync()
                solution = resourcealloc.alloc(reqs, sample=pickbest)
                to_provision.update(solution)
            
            # Assign the chosen node ids to the nodes, in parallel
            for node, node_id in zip(nodes, solution):
                runner.put(node.assign_node_id, node_id)
            runner.join()
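
resourcealloc.alloc is used here as a black box: it receives one candidate list per unassigned node (reqs) and must return one node id per request with no id repeated, optionally thinning large candidate sets through the sample callback (pickbest above). To make that interface concrete, here is a toy greedy stand-in; NEPI's real allocator performs a backtracking search instead of failing on a bad greedy choice, and every name below is illustrative.

def toy_alloc(reqs, sample=None):
    # Pick, for each request, a candidate id not already taken
    solution = []
    taken = set()
    for candidates in reqs:
        if sample is not None and len(candidates) > 3:
            # Hook to thin very large candidate sets before searching
            candidates = sample(set(candidates), 3)
        for c in candidates:
            if c not in taken:
                taken.add(c)
                solution.append(c)
                break
        else:
            raise RuntimeError("no feasible assignment")
    return solution

print toy_alloc([[1, 2], [2, 3], [1, 3]])   # e.g. [1, 2, 3]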