Example #1
0
    def resourceOffers(self, driver, offers):
        """Answer a batch of resource offers from the scheduler driver.

        Each offer is either declined or answered with ``launchTasks``.
        Pending tasks are popped from ``self.total_tasks`` and packed onto an
        offer for as long as the offer still has room for one more task plus
        the executor overhead (``EXECUTOR_CPUS`` / ``EXECUTOR_MEMORY``) and the
        per-agent limit ``options.task_per_node`` is not reached (``0`` means
        no limit).

        :param driver: scheduler driver used to decline offers and launch tasks.
        :param offers: list of offers; it is shuffled in place.
        """
        tpn = self.options.task_per_node
        random.shuffle(offers)
        self.last_offer_time = time.time()
        if not self.total_tasks:
            # Nothing left to schedule: stop the offer stream and hand
            # everything back.
            driver.suppressOffers()
            for o in offers:
                driver.declineOffer(o.id)
            return

        for offer in offers:
            # Keep the try body down to the ban() call so the handler only
            # reports failures of that function.
            try:
                banned = conf.ban(offer.hostname)
            except Exception:
                # A failing ban() is logged and the host is treated as not
                # banned. ``except Exception`` lets SystemExit and
                # KeyboardInterrupt propagate.
                logger.exception("bad ban() func in dpark.conf")
                banned = False
            if banned:
                logger.debug("skip offer on banned node: %s", offer.hostname)
                # Decline explicitly; an offer that is neither declined nor
                # used stays outstanding on the framework.
                driver.declineOffer(offer.id)
                continue

            attrs = self.getAttributes(offer)
            group = attrs.get('group', 'None')
            # Groups starting with '_' are only used when explicitly listed in
            # options.group; when options.group is set, only listed groups
            # are accepted.
            restricted = self.options.group or group.startswith('_')
            if restricted and group not in self.options.group:
                driver.declineOffer(offer.id, REFUSE_FILTER)
                continue

            cpus, mem, gpus = self.getResource(offer)
            logger.debug('got resource offer %s: cpus:%s, mem:%s, gpus:%s at %s',
                         offer.id.value, cpus, mem, gpus, offer.hostname)
            sid = offer.agent_id.value
            tasks = []
            # The loop condition already stops once total_tasks is empty, so
            # no extra break is needed inside the body.
            while (self.total_tasks
                   and cpus >= self.cpus + EXECUTOR_CPUS
                   and mem >= self.mem + EXECUTOR_MEMORY
                   and gpus >= self.gpus
                   and (tpn == 0
                        or (tpn > 0
                            and len(self.agentTasks.get(sid, ())) < tpn))):
                logger.debug('Accepting slot on agent %s (%s)',
                             offer.agent_id.value, offer.hostname)
                t = self.total_tasks.pop()
                tasks.append(self.create_task(offer, t))
                t.state = 'TASK_STARTING'
                t.state_time = time.time()
                self.task_launched[t.id] = t
                self.agentTasks.setdefault(sid, set()).add(t.id)
                cpus -= self.cpus
                mem -= self.mem
                gpus -= self.gpus

            logger.debug(
                'dispatch %d tasks to agent %s',
                len(tasks),
                offer.hostname)
            # Called even when ``tasks`` is empty, so the offer is always
            # answered.
            driver.launchTasks(offer.id, tasks, REFUSE_FILTER)
Example #2
0
    def resourceOffers(self, driver, offers):
        """Match resource offers against the active tasksets.

        Offers are first filtered (banned hosts, group mismatch, unhealthy
        hosts); every rejected offer is declined. The remaining offers are
        handed to each active taskset via ``taskset.taskOffer`` until it
        assigns nothing more. Offers that received tasks are launched, all
        others are declined.

        :param driver: scheduler driver used to decline offers and launch tasks.
        :param offers: iterable of offers received from the driver.
        """
        if not self.active_tasksets:
            # Nothing to schedule: stop the offer stream and refuse these
            # offers for five minutes.
            driver.suppressOffers()
            rf = Dict()
            rf.refuse_seconds = 60 * 5
            for o in offers:
                driver.declineOffer(o.id, rf)
            return

        start = time.time()
        filter_offer = []
        for o in offers:
            # Keep the try body down to the ban() call so the handler only
            # reports failures of that function.
            try:
                banned = conf.ban(o.hostname)
            except Exception:
                # A failing ban() is logged and the host is treated as not
                # banned. ``except Exception`` lets SystemExit and
                # KeyboardInterrupt propagate.
                logger.exception("bad ban() func in dpark.conf")
                banned = False
            if banned:
                logger.debug("skip offer on banned node: %s", o.hostname)
                # Decline explicitly; a banned offer is not kept in
                # ``filter_offer`` and would otherwise never be answered.
                driver.declineOffer(o.id)
                continue

            group = (self.getAttribute(o.attributes, 'group') or 'None')
            # Groups starting with '_' are only used when explicitly listed in
            # self.group; when self.group is set, only listed groups pass.
            restricted = self.group or group.startswith('_')
            if restricted and group not in self.group:
                driver.declineOffer(o.id,
                                    filters=Dict(refuse_seconds=0xFFFFFFFF))
                continue
            if self.task_host_manager.is_unhealthy_host(o.hostname):
                logger.warning('the host %s is unhealthy so skip it',
                               o.hostname)
                driver.declineOffer(o.id, filters=Dict(refuse_seconds=1800))
                continue
            self.task_host_manager.register_host(o.hostname)
            filter_offer.append(o)
        offers = filter_offer

        # Per-offer remaining resources, indexed like ``offers``.
        cpus = [self.getResource(o.resources, 'cpus') for o in offers]
        gpus = [self.getResource(o.resources, 'gpus') for o in offers]
        # EXECUTOR_MEMORY is reserved up front on agents that have no entry in
        # agent_id_to_ttids yet (presumably no executor running there yet --
        # TODO confirm).
        mems = [
            self.getResource(o.resources, 'mem') -
            (EXECUTOR_MEMORY
             if o.agent_id.value not in self.agent_id_to_ttids else 0)
            for o in offers
        ]
        # logger.debug('get %d offers (%s cpus, %s mem, %s gpus), %d tasksets',
        #             len(offers), sum(cpus), sum(mems), sum(gpus), len(self.active_tasksets))

        tasks = {}  # offer id value -> list of tasks to launch on that offer
        for taskset in self.active_tasksets.values():
            while True:
                # Rebuilt on every pass because the resource lists and the
                # per-agent task counts change as tasks are assigned.
                host_offers = {}
                for i, o in enumerate(offers):
                    if self.agent_id_to_ttids.get(o.agent_id.value,
                                                  0) >= self.task_per_node:
                        logger.debug('the task limit exceeded at host %s',
                                     o.hostname)
                        continue
                    if (mems[i] < self.mem + EXECUTOR_MEMORY
                            or cpus[i] < self.cpus + EXECUTOR_CPUS):
                        continue
                    host_offers[o.hostname] = (i, o)
                assigned_list = taskset.taskOffer(host_offers, cpus, mems,
                                                  gpus)
                if not assigned_list:
                    break
                for i, o, t in assigned_list:
                    task = self.createTask(o, t)
                    tasks.setdefault(o.id.value, []).append(task)
                    logger.debug('dispatch %s into %s', t, o.hostname)
                    ttid = task.task_id.value
                    agent_id = o.agent_id.value
                    taskset.ttids.add(ttid)
                    self.ttid_to_agent_id[ttid] = agent_id
                    self.agent_id_to_ttids[
                        agent_id] = self.agent_id_to_ttids.get(agent_id, 0) + 1
                    # The cpu count is clamped so it never drops below zero.
                    cpus[i] -= min(cpus[i], t.cpus)
                    mems[i] -= t.mem
                    gpus[i] -= t.gpus

        used = time.time() - start
        if used > 10:
            logger.error('use too much time in resourceOffers: %.2fs', used)

        for o in offers:
            if o.id.value in tasks:
                driver.launchTasks(o.id, tasks[o.id.value])
            else:
                driver.declineOffer(o.id)
Example #3
0
    def resourceOffers(self, driver, offers):
        """Answer a batch of resource offers from the scheduler driver.

        Each offer is either declined or answered with ``launchTasks``.
        Offers are declined when the host is banned, when the host's
        ``unavailability`` window starts within ``conf.DEFAULT_TASK_TIME``
        seconds from now, or when its group does not match. On the remaining
        offers, pending tasks are popped from ``self.total_tasks`` and packed
        for as long as the offer still has room for one more task plus the
        executor overhead (``EXECUTOR_CPUS`` / ``EXECUTOR_MEMORY``) and the
        per-agent limit ``options.task_per_node`` is not reached (``0`` means
        no limit).

        :param driver: scheduler driver used to decline offers and launch tasks.
        :param offers: list of offers; it is shuffled in place.
        """
        tpn = self.options.task_per_node
        random.shuffle(offers)
        self.last_offer_time = time.time()
        if not self.total_tasks:
            # Nothing left to schedule: stop the offer stream and hand
            # everything back.
            driver.suppressOffers()
            for o in offers:
                driver.declineOffer(o.id)
            return

        for offer in offers:
            # Keep the try body down to the ban() call so the handler only
            # reports failures of that function.
            try:
                banned = conf.ban(offer.hostname)
            except Exception:
                # A failing ban() is logged and the host is treated as not
                # banned. ``except Exception`` lets SystemExit and
                # KeyboardInterrupt propagate.
                logger.exception("bad ban() func in dpark.conf")
                banned = False
            if banned:
                logger.debug("skip offer on banned node: %s", offer.hostname)
                # Decline explicitly; an offer that is neither declined nor
                # used stays outstanding on the framework.
                driver.declineOffer(offer.id)
                continue

            unavailability = offer.get('unavailability')
            if unavailability is not None:
                # Skip hosts whose unavailability window begins before a task
                # of the default duration started now would end.
                deadline = sec2nanosec(time.time() + conf.DEFAULT_TASK_TIME)
                if deadline >= unavailability['start']['nanoseconds']:
                    logger.debug('the host %s plan to maintain, so skip it', offer.hostname)
                    driver.declineOffer(offer.id, filters=Dict(refuse_seconds=600))
                    continue

            attrs = self.getAttributes(offer)
            group = attrs.get('group', 'None')
            # Groups starting with '_' are only used when explicitly listed in
            # options.group; when options.group is set, only listed groups
            # are accepted.
            restricted = self.options.group or group.startswith('_')
            if restricted and group not in self.options.group:
                driver.declineOffer(offer.id, REFUSE_FILTER)
                continue

            cpus, mem, gpus = self.getResource(offer)
            logger.debug('got resource offer %s: cpus:%s, mem:%s, gpus:%s at %s',
                         offer.id.value, cpus, mem, gpus, offer.hostname)
            sid = offer.agent_id.value
            tasks = []
            # The loop condition already stops once total_tasks is empty, so
            # no extra break is needed inside the body.
            while (self.total_tasks
                   and cpus >= self.cpus + EXECUTOR_CPUS
                   and mem >= self.mem + EXECUTOR_MEMORY
                   and gpus >= self.gpus
                   and (tpn == 0
                        or (tpn > 0
                            and len(self.agentTasks.get(sid, ())) < tpn))):
                logger.debug('Accepting slot on agent %s (%s)',
                             offer.agent_id.value, offer.hostname)
                t = self.total_tasks.pop()
                tasks.append(self.create_task(offer, t))
                t.state = 'TASK_STARTING'
                # One timestamp is recorded both as the submit time in the
                # stats and as the task's state time.
                self.stats['submit_times'][t.id] = t.state_time = time.time()
                self.task_launched[t.id] = t
                self.agentTasks.setdefault(sid, set()).add(t.id)
                cpus -= self.cpus
                mem -= self.mem
                gpus -= self.gpus

            logger.debug(
                'dispatch %d tasks to agent %s',
                len(tasks),
                offer.hostname)
            # Called even when ``tasks`` is empty, so the offer is always
            # answered.
            driver.launchTasks(offer.id, tasks, REFUSE_FILTER)