def statusUpdate(self, driver, update) -> None:
        self._driver = driver

        task_id = update.task_id.value
        task_state = str(update.state)
        log.info(f"Task update {task_state} received for task {task_id}")

        if task_id not in self.task_metadata:
            # We assume that a terminal status update has been
            # received for this task already.
            log.info('Ignoring this status update because a terminal status '
                     'update has been received for this task already.')
            self.call_driver('acknowledgeStatusUpdate', update)
            return

        md = self.task_metadata[task_id]

        # If we attempt to accept an offer that has been invalidated by the
        # master (e.g. because the offer was rescinded or we exceeded
        # offer_timeout), we will get a TASK_LOST status update back from
        # the Mesos master.
        if task_state == 'TASK_LOST' and str(update.reason) == \
                'REASON_INVALID_OFFERS':
            # This task has not been launched, so we re-enqueue it. We do
            # not propagate any event up to the application.
            log.warning('Received TASK_LOST from mesos master because we '
                        'attempted to accept an invalid offer. Going to '
                        f're-enqueue this task {task_id}')
            # Re-enqueue task
            self.enqueue_task(md.task_config)
            get_metric(metrics.TASK_LOST_DUE_TO_INVALID_OFFER_COUNT).count(1)
            self.call_driver('acknowledgeStatusUpdate', update)
            return

        # Record state changes, send a new event and emit metrics only if the
        # task state has actually changed.
        if md.task_state != task_state:
            with self._lock:
                self.task_metadata = self.task_metadata.set(
                    task_id,
                    md.set(
                        task_state=task_state,
                        task_state_history=md.task_state_history.set(
                            task_state, time.time()),
                    )
                )
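            # Translate the raw Mesos update into a framework event and hand
            # it to the application via the event queue.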

            self.event_queue.put(
                self.callbacks.handle_status_update(update, md.task_config),
            )

            if task_state in self._terminal_task_counts:
                with self._lock:
                    self.task_metadata = self.task_metadata.discard(task_id)
                get_metric(self._terminal_task_counts[task_state]).count(1)

        # We have to do this because we are not using implicit
        # acknowledgements.
        self.call_driver('acknowledgeStatusUpdate', update)
    def blacklist_slave(self, agent_id, timeout):
        with self._lock:
            # A new entry is appended even if the agent is already blacklisted.
            # This is equivalent to restarting the blacklist timer.
            log.info(f'Blacklisting slave: {agent_id} for {timeout} seconds.')
            self.blacklisted_slaves = self.blacklisted_slaves.append(agent_id)
            get_metric(metrics.BLACKLISTED_AGENTS_COUNT).count(1)

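        # Remove the agent from the blacklist after `timeout` seconds; the
        # thread is a daemon so it never blocks interpreter shutdown.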
        unblacklist_thread = threading.Thread(
            target=self.unblacklist_slave,
            kwargs={'timeout': timeout, 'agent_id': agent_id},
        )
        unblacklist_thread.daemon = True
        unblacklist_thread.start()
    def _background_check(self):
        while True:
            if self.stopping:
                return

            time_now = time.time()
            with self._lock:
                for task_id in self.task_metadata.keys():
                    md = self.task_metadata[task_id]

                    if md.task_state not in ('TASK_STAGING', 'UNKNOWN'):
                        continue

                    # Task is not eligible for killing or re-enqueuing yet
                    if time_now < (md.task_state_history[md.task_state] +
                                   self.task_staging_timeout_s):
                        continue

                    if md.task_state == 'UNKNOWN':
                        log.warning(
                            'Task {id} has been in unknown state for '
                            'longer than {timeout} seconds. '
                            'Re-enqueuing it.'.format(
                                id=task_id,
                                timeout=self.task_staging_timeout_s))
                        # Re-enqueue task
                        self.enqueue_task(md.task_config)
                        get_metric(TASK_FAILED_TO_LAUNCH_COUNT).count(1)
                    else:
                        log.warning(
                            'Killing stuck task {id}'.format(id=task_id))
                        self.kill_task(task_id)
                        self.blacklist_slave(
                            agent_id=self.task_metadata[task_id].agent_id,
                            timeout=self.slave_blacklist_timeout_s,
                        )
                        get_metric(TASK_STUCK_COUNT).count(1)

            self._reconcile_tasks([
                Dict({'task_id': Dict({'value': task_id})})
                for task_id in self.task_metadata
                if self.task_metadata[task_id].task_state != 'TASK_INITED'
            ])
            time.sleep(10)
    def enqueue_task(self, task_config):
        with self._lock:
            # task_state and task_state_history get reset every time
            # a task is enqueued.
            self.task_metadata = self.task_metadata.set(
                task_config.task_id,
                TaskMetadata(
                    task_config=task_config,
                    task_state='TASK_INITED',
                    task_state_history=m(TASK_INITED=time.time()),
                ))
            # Need to lock on task_queue to prevent enqueues when getting
            # tasks to launch
            self.task_queue.put(task_config)

        if self.are_offers_suppressed:
            self.driver.reviveOffers()
            self.are_offers_suppressed = False
            log.info('Reviving offers because we have tasks to run.')

        get_metric(TASK_ENQUEUED_COUNT).count(1)
    def launch_tasks_for_offer(self, offer, tasks_to_launch) -> bool:
        mesos_protobuf_tasks = [
            self.callbacks.make_mesos_protobuf(
                task_config, offer.agent_id.value, self.role)
            for task_config in tasks_to_launch
            if task_config.task_id in self.task_metadata
        ]
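        # Only task configs still present in task_metadata are converted
        # above (anything already killed or timed out is dropped); the loop
        # further down logs a warning for any such task.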
        if not mesos_protobuf_tasks:
            return False

        launched = True
        launch_time = time.time()
        if self.call_driver('launchTasks', offer.id, mesos_protobuf_tasks) is self.driver_error:
            tasks = ', '.join(task.task_id for task in tasks_to_launch)
            log.warning(
                f'Failed to launch: {tasks}, moving them to UNKNOWN state')
            get_metric(metrics.TASK_LAUNCH_FAILED_COUNT).count(1)
            launched = False

        # 'UNKNOWN' state is for internal tracking. It will not be
        # propagated to users.
        current_task_state = 'TASK_STAGING' if launched else 'UNKNOWN'

        for task in tasks_to_launch:
            md = self.task_metadata.get(task.task_id)
            if not md:
                log.warning(
                    f'Trying to launch task {task.task_id}, but it is not in task metadata. '
                    f'Current keys in task_metadata: {self.task_metadata.keys()}'
                )
                continue
            self.task_metadata = self.task_metadata.set(
                task.task_id,
                md.set(
                    task_state=current_task_state,
                    task_state_history=md.task_state_history.set(
                        current_task_state, launch_time),
                    agent_id=str(offer.agent_id.value),
                )
            )

            get_metric(metrics.TASK_QUEUED_TIME_TIMER).record(
                launch_time - md.task_state_history['TASK_INITED']
            )

            # Emit the staging event for successful launches
            if launched:
                self.event_queue.put(
                    self.callbacks.handle_status_update(
                        Dict(state='TASK_STAGING', offer=offer),
                        md.task_config,
                    )
                )
                get_metric(metrics.TASK_LAUNCHED_COUNT).count(1)

        return launched
    def _background_check(self):
        while True:
            if self.stopping:
                return

            time_now = time.time()
            tasks_to_reconcile = []
            with self._lock:
                for task_id, md in self.task_metadata.items():
                    self._background_check_task(
                        time_now,
                        tasks_to_reconcile,
                        task_id,
                        md,
                    )

            self._reconcile_tasks(
                [Dict({'task_id': Dict({'value': task_id})}) for
                    task_id in tasks_to_reconcile]
            )
            elapsed = time.time() - time_now
            log.info(f'background check done in {elapsed}s')
            get_metric(metrics.BGCHECK_TIME_TIMER).record(elapsed)
            time.sleep(10)
def emit_counter_metric(counter_name, service, instance):
    create_counter(counter_name, {'service': service, 'instance': instance})
    get_metric(counter_name).count(1)
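
# A minimal usage sketch for emit_counter_metric; the counter name, service
# and instance values below are hypothetical:
#
#   emit_counter_metric(
#       'taskproc.tasks.lost', service='example-service', instance='main')
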
    def resourceOffers(self, driver, offers) -> None:
        self._driver = driver

        current_offer_time = time.time()
        if self._last_offer_time is not None:
            get_metric(metrics.OFFER_DELAY_TIMER).record(
                current_offer_time - self._last_offer_time
            )
        self._last_offer_time = current_offer_time

        # Give the user some time to enqueue tasks
        if self.task_queue.empty() and current_offer_time < self.decline_after:
            time.sleep(self.decline_after - current_offer_time)

        declined: dict = defaultdict(list)
        declined_offer_ids = []
        accepted = []

        with self._lock:
            if self.task_queue.empty():
                # Always suppress offers when there is nothing to run
                if self.call_driver('suppressOffers') is not self.driver_error:
                    self.are_offers_suppressed = True
                    log.info("Suppressing offers, no more tasks to run.")

                for offer in offers:
                    declined['no tasks'].append(offer.id.value)
                    declined_offer_ids.append(offer.id)

                self.call_driver(
                    'declineOffer', declined_offer_ids, self.offer_decline_filter)
                log.info(
                    f"Offers declined because of no tasks: {','.join(declined['no tasks'])}")
                return

        with_maintenance_window = [
            offer for offer in offers if offer.unavailability
        ]

        for offer in with_maintenance_window:
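            # Blacklist the agent until its maintenance window ends so that
            # we do not launch tasks on an agent that is about to go down.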
            start_time = offer.unavailability.start['nanoseconds']
            completion_time = int(
                (start_time + offer.unavailability.duration['nanoseconds'])
                / 1000000000
            )
            now = int(time.time())
            duration = completion_time - now
            if duration > 0:
                self.blacklist_slave(
                    agent_id=offer.agent_id.value,
                    timeout=duration,
                )

        without_maintenance_window = [
            offer for offer in offers if offer not in with_maintenance_window
        ]
        for offer in without_maintenance_window:
            with self._lock:
                if offer.agent_id.value in self.blacklisted_slaves:
                    declined['blacklisted'].append(
                        f'offer {offer.id.value} agent {offer.agent_id.value}'
                    )
                    declined_offer_ids.append(offer.id)
                    continue

            offer_pool_match, offer_pool = self.offer_matches_pool(offer)
            if not offer_pool_match:
                log.info(
                    f"Declining offer {offer.id.value}, required pool "
                    f"{self.pool} doesn't match offered pool {offer_pool}"
                )
                declined['bad pool'].append(offer.id.value)
                declined_offer_ids.append(offer.id)
                continue

            # Need to lock here even though we are only reading the task_queue, since
            # we are predicating on the queue's emptiness. If not locked, other
            # threads can continue enqueueing, and we never terminate the loop.
            task_configs = []
            with self._lock:
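                # Drain the queue so the offer-matching callback sees every
                # pending task; tasks that do not fit are re-queued below.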
                while not self.task_queue.empty():
                    task_configs.append(self.task_queue.get())

                offer_resources = get_offer_resources(offer, self.role)
                offer_attributes = {
                    attribute.name: attribute.text.value
                    for attribute in offer.attributes
                }
                log.info(
                    f'Received offer {offer.id.value} for role {self.role}: {offer_resources}')
                tasks_to_launch, tasks_to_defer = self.callbacks.get_tasks_for_offer(
                    task_configs,
                    offer_resources,
                    offer_attributes,
                    self.role,
                )

                for task in tasks_to_defer:
                    self.task_queue.put(task)
                get_metric(metrics.TASK_INSUFFICIENT_OFFER_COUNT).count(len(tasks_to_defer))

                if len(tasks_to_launch) == 0:
                    declined['bad resources'].append(offer.id.value)
                    declined_offer_ids.append(offer.id)
                    continue

                ignored_tasks = ','.join(
                    task_config.task_id
                    for task_config in tasks_to_launch
                    if task_config.task_id not in self.task_metadata
                )
                if ignored_tasks:
                    log.warning(
                        f'ignoring tasks not in metadata: {ignored_tasks}')

                tasks_to_launch = [
                    task_config
                    for task_config in tasks_to_launch
                    if task_config.task_id in self.task_metadata
                ]

                if len(tasks_to_launch) == 0:
                    declined['nothing to launch'].append(offer.id.value)
                    declined_offer_ids.append(offer.id)
                elif not self.launch_tasks_for_offer(offer, tasks_to_launch):
                    declined['launch failed'].append(offer.id.value)
                    declined_offer_ids.append(offer.id)
                else:
                    accepted.append(
                        f'offer: {offer.id.value} '
                        f'agent: {offer.agent_id.value} '
                        f'tasks: {len(tasks_to_launch)}'
                    )

        if len(declined_offer_ids) > 0:
            self.call_driver(
                'declineOffer', declined_offer_ids, self.offer_decline_filter
            )
        for reason, items in declined.items():
            log.info(f"Offers declined because {reason}: {', '.join(items)}")
        if accepted:
            log.info(f"Offers accepted: {', '.join(accepted)}")
    def _background_check_task(self, time_now, tasks_to_reconcile, task_id, md):
        if md.task_state != 'TASK_INITED':
            tasks_to_reconcile.append(task_id)

        if md.task_state == 'TASK_INITED':
            # Give up if the task hasn't been launched within offer_timeout.
            inited_at = md.task_state_history['TASK_INITED']
            offer_timeout = md.task_config.offer_timeout
            expires_at = inited_at + offer_timeout
            if time_now >= expires_at:
                log.warning(
                    f'Task {task_id} has been waiting for offers for longer '
                    f'than the configured timeout of {offer_timeout}s. Giving '
                    'up and removing the task from the task queue.'
                )
                # Killing the task also removes it from the task queue.
                self.kill_task(task_id)
                # Mesos will not send a terminal update for a task that was
                # never launched, so clean up the metadata manually.
                self.task_metadata = self.task_metadata.discard(
                    task_id
                )
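                # Emit a synthetic terminal event so the application still
                # learns that the task failed due to offer timeout.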
                self.event_queue.put(
                    task_event(
                        task_id=task_id,
                        terminal=True,
                        timestamp=time_now,
                        success=False,
                        message='stop',
                        task_config=md.task_config,
                        raw='Failed due to offer timeout',
                    )
                )
                get_metric(metrics.TASK_OFFER_TIMEOUT).count(1)

        # Task is not eligible for killing or re-enqueuing yet
        in_current_state_since = md.task_state_history[md.task_state]
        if time_now < in_current_state_since + self.task_staging_timeout_s:
            return

        if md.task_state == 'UNKNOWN':
            log.warning(
                f'Re-enqueuing task {task_id}: it has been in unknown state '
                f'for longer than {self.task_staging_timeout_s}s'
            )
            # Re-enqueue task
            self.enqueue_task(md.task_config)
            get_metric(
                metrics.TASK_FAILED_TO_LAUNCH_COUNT).count(1)
        elif md.task_state == 'TASK_STAGING':
            log.warning(f'Killing stuck task {task_id}')
            self.kill_task(task_id)
            self.task_metadata = self.task_metadata.set(
                task_id,
                md.set(
                    task_state='TASK_STUCK',
                    task_state_history=md.task_state_history.set(
                        'TASK_STUCK', time_now),
                )
            )
            self.blacklist_slave(
                agent_id=self.task_metadata[task_id].agent_id,
                timeout=self.slave_blacklist_timeout_s,
            )
            get_metric(metrics.TASK_STUCK_COUNT).count(1)
        elif md.task_state == 'TASK_STUCK':
            t = time.time()
            # 10s since last iteration + time we spent in current one
            time_delta = 10 + t - time_now
            # seconds since task was put in TASK_STUCK state
            time_stuck = t - md.task_state_history['TASK_STUCK']
            # seconds since `time_stuck` crossed another hour
            # boundary
            hour_rolled = time_stuck % 3600

            # If `time_stuck` has crossed an hour boundary since the last
            # background check, re-send the kill request.
            if hour_rolled < time_delta:
                hours_stuck = time_stuck // 3600
                log.warning(
                    f'Task {task_id} is stuck, waiting for terminal '
                    f'state for {hours_stuck}h, sending another kill'
                )
                self.kill_task(task_id)
    def resourceOffers(self, driver, offers):
        if self.driver is None:
            self.driver = driver

        current_offer_time = time.time()
        if self._last_offer_time is not None:
            get_metric(OFFER_DELAY_TIMER).record(current_offer_time -
                                                 self._last_offer_time)
        self._last_offer_time = current_offer_time

        # Give the user some time to enqueue tasks
        if self.task_queue.empty() and current_offer_time < self.decline_after:
            time.sleep(self.decline_after - current_offer_time)

        declined = {
            'blacklisted': [],
            'bad pool': [],
            'bad resources': [],
            'no tasks': []
        }
        declined_offer_ids = []
        accepted = []

        if self.task_queue.empty():
            if not self.are_offers_suppressed:
                driver.suppressOffers()
                self.are_offers_suppressed = True
                log.info("Suppressing offers, no more tasks to run.")

            for offer in offers:
                declined['no tasks'].append(offer.id.value)
                declined_offer_ids.append(offer.id)

            driver.declineOffer(declined_offer_ids, self.offer_decline_filter)
            log.info("Offers declined because of no tasks: {}".format(','.join(
                declined['no tasks'])))
            return

        with_maintenance_window = [
            offer for offer in offers if offer.unavailability
        ]

        for offer in with_maintenance_window:
            start_time = offer.unavailability.start['nanoseconds']
            completion_time = int(
                (start_time + offer.unavailability.duration['nanoseconds']) /
                1000000000)
            now = int(time.time())
            duration = completion_time - now
            if duration > 0:
                self.blacklist_slave(
                    agent_id=offer.agent_id.value,
                    timeout=duration,
                )

        without_maintenance_window = [
            offer for offer in offers if offer not in with_maintenance_window
        ]
        for offer in without_maintenance_window:
            with self._lock:
                if offer.agent_id.value in self.blacklisted_slaves:
                    declined['blacklisted'].append('offer {} agent {}'.format(
                        offer.id.value, offer.agent_id.value))
                    declined_offer_ids.append(offer.id)
                    continue

            if not self.offer_matches_pool(offer):
                log.info("Declining offer {id} because it is not for pool "
                         "{pool}.".format(id=offer.id.value, pool=self.pool))
                declined['bad pool'].append(offer.id.value)
                declined_offer_ids.append(offer.id)
                continue

            tasks_to_launch = self.get_tasks_to_launch(offer)

            if len(tasks_to_launch) == 0:
                if self.task_queue.empty():
                    if offer.id.value not in declined['no tasks']:
                        declined['no tasks'].append(offer.id.value)
                else:
                    declined['bad resources'].append(offer.id.value)
                declined_offer_ids.append(offer.id)
                continue

            accepted.append('offer: {} agent: {} tasks: {}'.format(
                offer.id.value, offer.agent_id.value, len(tasks_to_launch)))

            task_launch_failed = False
            try:
                driver.launchTasks(offer.id, tasks_to_launch)
            except Exception:
                # Note: socket.timeout is a subclass of OSError, so catching
                # Exception alone also covers driver timeouts.
                log.warning('Failed to launch the following tasks: {tasks}. '
                            'Moving them to UNKNOWN state.'.format(
                                tasks=', '.join([
                                    task.task_id.value
                                    for task in tasks_to_launch
                                ])))
                task_launch_failed = True
                get_metric(TASK_LAUNCH_FAILED_COUNT).count(1)

            # 'UNKNOWN' state is for internal tracking. It will not be
            # propagated to users.
            current_task_state = 'UNKNOWN' if task_launch_failed else \
                'TASK_STAGING'
            with self._lock:
                for task in tasks_to_launch:
                    md = self.task_metadata[task.task_id.value]
                    self.task_metadata = self.task_metadata.set(
                        task.task_id.value,
                        md.set(
                            task_state=current_task_state,
                            task_state_history=md.task_state_history.set(
                                current_task_state, time.time()),
                        ))
                    if not task_launch_failed:
                        get_metric(TASK_LAUNCHED_COUNT).count(1)

        if len(declined_offer_ids) > 0:
            driver.declineOffer(declined_offer_ids, self.offer_decline_filter)
        for reason, items in declined.items():
            if items:
                log.info("Offers declined because of {}: {}".format(
                    reason, ', '.join(items)))
        if accepted:
            log.info("Offers accepted: {}".format(', '.join(accepted)))
    def get_tasks_to_launch(self, offer):
        tasks_to_launch = []
        remaining_cpus = 0
        remaining_mem = 0
        remaining_disk = 0
        remaining_gpus = 0
        available_ports = []
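        # Accumulate the resources in this offer that are offered under our
        # configured role.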
        for resource in offer.resources:
            if resource.name == "cpus" and resource.role == self.role:
                remaining_cpus += resource.scalar.value
            elif resource.name == "mem" and resource.role == self.role:
                remaining_mem += resource.scalar.value
            elif resource.name == "disk" and resource.role == self.role:
                remaining_disk += resource.scalar.value
            elif resource.name == "gpus" and resource.role == self.role:
                remaining_gpus += resource.scalar.value
            elif resource.name == "ports" and resource.role == self.role:
                # TODO: Validate if the ports available > ports required
                available_ports = self.get_available_ports(resource)

        log.info("Received offer {id} with cpus: {cpu}, mem: {mem}, "
                 "disk: {disk} gpus: {gpu} role: {role}".format(
                     id=offer.id.value,
                     cpu=remaining_cpus,
                     mem=remaining_mem,
                     disk=remaining_disk,
                     gpu=remaining_gpus,
                     role=self.role))

        tasks_to_put_back_in_queue = []

        # Need to lock here even though we are working on the task_queue, since
        # we are predicating on the queue's emptiness. If not locked, other
        # threads can continue enqueueing, and we never terminate the loop.
        with self._lock:
            # Get all the tasks of the queue
            while not self.task_queue.empty():
                task = self.task_queue.get()

                if ((remaining_cpus >= task.cpus and remaining_mem >= task.mem
                     and remaining_disk >= task.disk
                     and remaining_gpus >= task.gpus
                     and len(available_ports) > 0)):
                    # This offer is sufficient for us to launch task
                    tasks_to_launch.append(
                        self.create_new_docker_task(offer, task,
                                                    available_ports))

                    # Deduct the resources taken by this task from the total
                    # available resources.
                    remaining_cpus -= task.cpus
                    remaining_mem -= task.mem
                    remaining_disk -= task.disk
                    remaining_gpus -= task.gpus

                    md = self.task_metadata[task.task_id]
                    get_metric(TASK_QUEUED_TIME_TIMER).record(
                        time.time() - md.task_state_history['TASK_INITED'])
                else:
                    # This offer is insufficient for this task. We need to put
                    # it back in the queue
                    tasks_to_put_back_in_queue.append(task)

        for task in tasks_to_put_back_in_queue:
            self.task_queue.put(task)
            get_metric(TASK_INSUFFICIENT_OFFER_COUNT).count(1)

        return tasks_to_launch