def redispatch(self, name, sid, srl, service, reason, now):
    """Send ``service`` to ``dispatch`` again unless it is already handled.

    Args:
        name: Name of the attribute on the entry that maps a stage to a
            collection of service names; a service found there is not
            dispatched again.
        sid: First-level key into ``self.entries``.
        srl: Second-level key into ``self.entries[sid]``.
        service: Service object; its ``name`` drives the lookups and the
            object itself is handed to ``self.dispatch``.
        reason: Text written to the log as the cause of the redispatch.
        now: Value forwarded unchanged to ``self.dispatch``.

    Returns:
        True when ``self.dispatch`` was called without raising (its own
        return value is ignored). False when the entry is unknown, when the
        service is already listed as completed or under ``name`` for its
        stage, or when an exception occurred; in the last case an error
        response is pushed onto ``self.storage_queue``.
    """
    try:
        entry = self.entries[sid][srl]
    except KeyError:
        # Nothing is tracked for this sid/srl pair.
        return False

    try:
        stage = self.service_manager.stage_by_name(service.name)
        listed = getattr(entry, name)[stage]
        finished = entry.completed_services[stage]

        if service.name in finished:
            return False
        if listed and service.name in listed:
            return False

        log.info("%s for %s: %s/%s", reason, service.name, sid, srl)
        self.dispatch(service, entry, now)
        return True
    except Exception as ex:  #pylint: disable=W0703
        trace = get_stacktrace_info(ex)
        log.error("Couldn't redispatch to %s for %s/%s: %s",
                  service.name, sid, srl, trace)

        # Report the failure on a copy so the tracked task stays untouched.
        response = Task(deepcopy(entry.task.raw))
        response.watermark(service.name, '')
        response.nonrecoverable_failure(trace)
        error_message = {
            'type': 'error',
            'name': service.name,
            'response': response,
        }
        self.storage_queue.push(error_message)
        return False
def _drain(self):
    """Report every item in ``self._current_work_items`` as a recoverable failure.

    While holding ``self._current_work_items_lock``, each work item is
    turned into a fresh task carrying only its sid, srl, dispatch queue and
    classification. That task is marked as a recoverable failure, saved as
    an error in the datastore, and sent to the dispatch queue. When there
    are no work items, only ``EXIT_DRAIN:0`` is logged.
    """
    with self._current_work_items_lock:
        pending = self._current_work_items
        if not pending:
            self.log.info('EXIT_DRAIN:0')
            return

        result_store = forge.get_datastore()
        dispatch_queue = forge.get_dispatch_queue()
        self.log.info('EXIT_DRAIN:%s', len(pending))

        for raw_item in pending:
            source = Task(raw_item)

            # Build a minimal task: only the routing fields are carried over.
            failure = Task({})
            for field in ('sid', 'srl', 'dispatch_queue', 'classification'):
                setattr(failure, field, getattr(source, field))

            self.log.info("DRAIN: %s/%s", failure.sid, failure.srl)

            service_name = self.service_cls.SERVICE_NAME
            failure.watermark(service_name, None)
            failure.recoverable_failure(
                'Task was pre-empted (shutdown, vm revert or cull)')
            failure.cache_key = result_store.save_error(
                self.service_cls.SERVICE_NAME, None, None, failure)
            dispatch_queue.send_raw(failure.as_dispatcher_response())
def dispatch(self, service, entry, now):
    """Send the entry's task to ``service`` or record why it was not sent.

    Increments the per-service queue size and the entry's retry counter,
    then either refuses the task (error response pushed to
    ``self.storage_queue``), skips it (success response sent through ``q``),
    or executes it through ``service.proxy`` and registers an ack timeout.

    Args:
        service: Service object providing ``name``, ``timeout``, ``skip``
            and ``proxy``.
        entry: Tracked entry providing ``task`` and ``retries``.
        now: Current time; stored as ``task.sent`` and used as the base of
            the ack timeout expiry.

    Returns:
        True when the request was handed to ``service.proxy``; False when
        the task was refused or skipped.
    """
    task = entry.task
    sid, srl = task.sid, task.srl
    name = service.name

    queue_size = self.queue_size.get(name, 0) + 1
    self.queue_size[name] = queue_size

    attempt = entry.retries.get(name, -1) + 1
    entry.retries[name] = attempt

    if task.profile:
        if attempt:
            log.info('%s Graph: "%s" -> "%s/%s" [label=%d];',
                     sid, srl, srl, name, attempt)
        else:
            log.info('%s Graph: "%s" -> "%s/%s";',
                     sid, srl, srl, name)
            log.info('%s Graph: "%s/%s" [label=%s];',
                     sid, srl, name, name)

    file_count = len(self.entries[sid]) + len(self.completed[sid])
    limits = config.core.dispatcher.max

    # Warning: Please do not change the text of the error messages below.
    msg = None
    if self._service_is_down(service, now):
        msg = 'Service down.'
    elif attempt > limits.retries:
        msg = 'Max retries exceeded.'
    elif attempt >= 1:
        # A retry within the allowed count bypasses the depth/files checks.
        log.debug("Retry sending %s/%s to %s", sid, srl, name)
    elif task.depth > limits.depth:
        msg = 'Max depth exceeded.'
    elif file_count > limits.files:
        msg = 'Max files exceeded.'

    if msg:
        detail = "Not sending %s/%s to %s." % (sid, srl, name)
        log.debug(' '.join((msg, detail)))

        refusal = Task(deepcopy(task.raw))
        refusal.watermark(name, '')
        refusal.nonrecoverable_failure(msg)
        self.storage_queue.push({
            'type': 'error',
            'name': name,
            'response': refusal,
        })
        return False

    if service.skip(task):
        skipped = Task(deepcopy(task.raw))
        skipped.watermark(name, '')
        skipped.success()
        q.send_raw(skipped.as_dispatcher_response())
        return False

    # Setup an ack timeout: scales with the queue size, capped at 7200.
    seconds = min(service.timeout * (queue_size + 5), 7200)
    task.ack_timeout = seconds
    task.sent = now

    request = task.as_service_request(name)
    service.proxy.execute(task.priority, request)

    # Add the timeout to the end of its respective list.
    timeouts = self.ack_timeout
    bucket = timeouts.get(seconds, [])
    bucket.append(Timeout(sid, srl, name, now + seconds))
    timeouts[seconds] = bucket

    return True
# Queue monitoring loop: forever, trim every service queue that has grown
# past its configured threshold and answer each removed task with a
# 'Service busy.' failure, then wait for the configured update interval.
logger.info("Monitoring the following service queues: %s", threshold)
while True:
    queue_lengths = get_service_queue_lengths()

    # Keep only queues with a truthy threshold whose length exceeds it.
    # A missing or falsy threshold falls back to the length itself, and
    # ``v > v`` is never true, so such queues are left alone.
    over = {
        k: v for k, v in queue_lengths.iteritems()
        if v > (threshold.get(k, 0) or v)
    }

    for name, size in over.iteritems():
        excess = size - threshold.get(name, size)
        if excess <= 0:
            continue

        for msg in get_queue(name).unpush(excess):
            # Best effort per message: a failure on one removed task must
            # not prevent the remaining ones from being answered.
            # noinspection PyBroadException
            try:
                t = Task(msg)
                t.watermark(name, '')
                t.nonrecoverable_failure('Service busy.')
                t.cache_key = store.save_error(name, None, None, t)
                dispatch_queue.send_raw(t.as_dispatcher_response())
                logger.info("%s is too busy to process %s.", name, t.srl)
            except Exception:  # pylint:disable=W0703
                # Deliberately not a bare ``except:`` so KeyboardInterrupt
                # and SystemExit can still stop this endless loop.
                logger.exception('Problem sending response:')

    time.sleep(config.system.update_interval)