def send_raw(self, raw, shards=None):
    if not shards:
        config = forge.get_config()
        shards = config.core.dispatcher.shards
    task = Task(raw)
    self.send(task, shards)
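# Illustrative sketch only: how a caller might hand a raw task dict to
# send_raw(). The queue is obtained through forge as elsewhere in this
# codebase; the payload and the explicit shard count shown here are
# placeholders, not a documented schema.
def _example_send_raw(raw_response):
    dispatch_queue = forge.get_dispatch_queue()
    # Let the queue read the shard count from the loaded configuration...
    dispatch_queue.send_raw(raw_response)
    # ...or pin it explicitly when the caller already knows the shard count.
    dispatch_queue.send_raw(raw_response, shards=4)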
def redispatch(self, name, sid, srl, service, reason, now):
    entry = None
    try:
        entry = self.entries[sid][srl]
    except KeyError:
        return False

    try:
        stage = self.service_manager.stage_by_name(service.name)
        d = getattr(entry, name)[stage]
        c = entry.completed_services[stage]
        if service.name in c or d and service.name in d:
            return False

        log.info("%s for %s: %s/%s", reason, service.name, sid, srl)
        self.dispatch(service, entry, now)
        return True
    except Exception as ex:  # pylint: disable=W0703
        trace = get_stacktrace_info(ex)
        log.error("Couldn't redispatch to %s for %s/%s: %s",
                  service.name, sid, srl, trace)

        response = Task(deepcopy(entry.task.raw))
        response.watermark(service.name, '')
        response.nonrecoverable_failure(trace)

        self.storage_queue.push({
            'type': 'error',
            'name': service.name,
            'response': response,
        })
        return False
def ingester():  # df node def  # pylint:disable=R0912
    datastore = forge.get_datastore()
    user_groups = {}

    # Move from ingest to unique and waiting queues.
    # While there are entries in the ingest queue we consume chunk_size
    # entries at a time and move unique entries to uniqueq / queued and
    # duplicates to their own queues / waiting.
    while running:
        while True:
            result = completeq.pop(blocking=False)  # df pull pop
            if not result:
                break
            completed(Task(result))  # df push calls

        entry = ingestq.pop(timeout=1)  # df pull pop
        if not entry:
            continue

        trafficq.push(entry)  # df push push

        sha256 = entry.get('sha256', '')
        if not sha256 or len(sha256) != 64:
            logger.error("Invalid sha256: %s", entry)
            continue

        entry['md5'] = entry.get('md5', '').lower()
        entry['sha1'] = entry.get('sha1', '').lower()
        entry['sha256'] = sha256.lower()

        ingest(datastore, user_groups, entry)  # df push calls

    datastore.close()
def _send_control_queue_call(cls, shard, state, **kw):
    name = reply_queue_name(state)
    kw.update({
        'state': state,
        'watch_queue': name,
    })
    t = Task({}, **kw)
    forge.get_control_queue('control-queue-' + str(shard)).push(t.raw)
    nq = NamedQueue(name)
    return nq.pop(timeout=5)
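# Illustrative sketch only: asking dispatcher shard 0 for its system time via
# the control queue, mirroring the inline version in _check_time_drift below.
# The 'get_system_time' state and the five second reply timeout come from the
# surrounding code; treat the class argument here as a placeholder for
# whatever class owns _send_control_queue_call.
def _example_get_dispatcher_time(controller_cls, shard=0):
    reply = controller_cls._send_control_queue_call(shard, 'get_system_time')
    if reply is None or 'time' not in reply:
        return None  # no dispatcher answered within the pop timeout
    return reply['time']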
def _do_work(self, raw_task):
    """
    Complete an incoming work item.

    Note: This will block while a service is executing the task.
    For some services this could be many seconds or even minutes.
    """
    assert not isinstance(raw_task, list)
    task = Task(raw_task)
    # noinspection PyProtectedMember
    self.service._handle_task(task)
    self.work_count.value += 1
def _drain(self):
    with self._current_work_items_lock:
        if not self._current_work_items:
            self.log.info('EXIT_DRAIN:0')
            return

        result_store = forge.get_datastore()
        dispatch_queue = forge.get_dispatch_queue()
        self.log.info('EXIT_DRAIN:%s', len(self._current_work_items))

        for item in self._current_work_items:
            work = Task(item)
            task = Task({})
            task.sid = work.sid
            task.srl = work.srl
            task.dispatch_queue = work.dispatch_queue
            task.classification = work.classification
            self.log.info("DRAIN: %s/%s", task.sid, task.srl)
            task.watermark(self.service_cls.SERVICE_NAME, None)
            task.recoverable_failure(
                'Task was pre-empted (shutdown, vm revert or cull)')
            task.cache_key = result_store.save_error(
                self.service_cls.SERVICE_NAME, None, None, task)
            dispatch_queue.send_raw(task.as_dispatcher_response())
def _check_time_drift(self):
    dispatcher = '0'
    name = reply_queue_name('cli_get_time')
    t = Task({}, **{
        'state': 'get_system_time',
        'watch_queue': name,
    })
    forge.get_control_queue('control-queue-' + dispatcher).push(t.raw)
    nq = NamedQueue(name)
    r = nq.pop(timeout=5)
    if r is None or 'time' not in r:
        self.log.warn("Timed out trying to determine the dispatcher's clock.")
        return

    clock_difference = abs(r['time'] - time.time())
    if clock_difference > 600:
        self.log.info(
            "Dispatcher's clock is %s seconds away from ours. Clocks are not set correctly.",
            clock_difference)
    else:
        self.log.debug('Clock drift from dispatcher: %s.', clock_difference)
def dispatch(self, service, entry, now):
    task = entry.task
    sid = task.sid
    srl = task.srl
    name = service.name

    # Track a per-service count of dispatches; it is used below to scale
    # the ack timeout for this task.
    queue_size = self.queue_size[name] = self.queue_size.get(name, 0) + 1
    entry.retries[name] = entry.retries.get(name, -1) + 1

    if task.profile:
        if entry.retries[name]:
            log.info('%s Graph: "%s" -> "%s/%s" [label=%d];',
                     sid, srl, srl, name, entry.retries[name])
        else:
            log.info('%s Graph: "%s" -> "%s/%s";', sid, srl, srl, name)
            log.info('%s Graph: "%s/%s" [label=%s];', sid, srl, name, name)

    file_count = len(self.entries[sid]) + len(self.completed[sid])

    # Warning: Please do not change the text of the error messages below.
    msg = None
    if self._service_is_down(service, now):
        msg = 'Service down.'
    elif entry.retries[name] > config.core.dispatcher.max.retries:
        msg = 'Max retries exceeded.'
    elif entry.retries[name] >= 1:
        log.debug("Retry sending %s/%s to %s", sid, srl, name)
    elif task.depth > config.core.dispatcher.max.depth:
        msg = 'Max depth exceeded.'
    elif file_count > config.core.dispatcher.max.files:
        msg = 'Max files exceeded.'

    if msg:
        log.debug(' '.join((msg, "Not sending %s/%s to %s." % (sid, srl, name))))
        response = Task(deepcopy(task.raw))
        response.watermark(name, '')
        response.nonrecoverable_failure(msg)
        self.storage_queue.push({
            'type': 'error',
            'name': name,
            'response': response,
        })
        return False

    if service.skip(task):
        response = Task(deepcopy(task.raw))
        response.watermark(name, '')
        response.success()
        q.send_raw(response.as_dispatcher_response())
        return False

    # Set up an ack timeout.
    seconds = min(service.timeout * (queue_size + 5), 7200)
    task.ack_timeout = seconds
    task.sent = now

    service.proxy.execute(task.priority, task.as_service_request(name))

    # Add the timeout to the end of its respective list.
    ack_timeout = self.ack_timeout
    lst = ack_timeout.get(seconds, [])
    lst.append(Timeout(sid, srl, name, now + seconds))
    ack_timeout[seconds] = lst

    return True
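# Illustrative sketch only: the ack timeout math used in dispatch() above,
# pulled out with example numbers. A service with a 60 second timeout and a
# per-service dispatch count of 10 gets 60 * (10 + 5) = 900 seconds before the
# dispatcher considers the acknowledgement lost, and the result is always
# capped at 7200 seconds (two hours). The defaults here are examples only.
def _example_ack_timeout(service_timeout=60, queue_size=10, cap=7200):
    return min(service_timeout * (queue_size + 5), cap)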
def init():
    datastore = forge.get_datastore()
    datastore.commit_index('submission')

    sids = [
        x['submission.sid'] for x in datastore.stream_search(
            'submission',
            'state:submitted AND times.submitted:[NOW-1DAY TO *] '
            'AND submission.metadata.type:* '
            'AND NOT submission.description:Resubmit*')
    ]

    submissions = {}
    submitted = {}
    for submission in datastore.get_submissions(sids):
        task = Task(submission)

        if not task.original_selected or not task.root_sha256 or not task.scan_key:
            continue

        if forge.determine_ingest_queue(task.root_sha256) != ingestq_name:
            continue

        scan_key = task.scan_key
        submissions[task.sid] = submission
        submitted[scan_key] = task.sid

    # Outstanding is the set of things Riak believes are being scanned.
    outstanding = set(submitted.keys())

    # Keys is the set of things middleman believes are being scanned.
    keys = set(scanning.keys())

    # Inflight is the set of submissions middleman and Riak agree are inflight.
    inflight = outstanding.intersection(keys)

    # Missing is the set of submissions middleman thinks are in flight but
    # according to Riak are not incomplete.
    missing = keys.difference(inflight)

    # Process the set of submissions Riak believes are incomplete but
    # middleman doesn't know about.
    for scan_key in outstanding.difference(inflight):
        sid = submitted.get(scan_key, None)
        if not sid:
            logger.info("Init: No sid found for incomplete")
            continue

        submission = submissions[sid]
        task = Task(submission)

        # Validate the submission we just loaded (not a task left over from
        # the loop above).
        if not task.original_selected or not task.root_sha256 or not task.scan_key:
            logger.info("Init: Not root_sha256 or original_selected")
            continue

        if not task.metadata:
            logger.info("Init: Incomplete submission is not one of ours: %s", sid)

        stype = None
        try:
            stype = task.metadata.get('type', None)
        except:  # pylint: disable=W0702
            logger.exception(
                "Init: Incomplete submission has malformed metadata: %s", sid)

        if not stype:
            logger.info("Init: Incomplete submission missing type: %s", sid)

        raw = {
            'metadata': task.metadata,
            'overrides': get_submission_overrides(task, overrides),
            'sha256': task.root_sha256,
            'type': stype,
        }
        raw['overrides']['selected'] = task.original_selected

        reinsert(datastore, " (incomplete)", Notice(raw), logger)

    r = redis.StrictRedis(persistent['host'],
                          persistent['port'],
                          persistent['db'])

    # Duplicates is the set of sha256s where a duplicate queue exists.
    duplicates = [
        x.replace(dup_prefix, '', 1) for x in r.keys(dup_prefix + '*')
    ]

    # Process the set of duplicates where no scanning or riak entry exists.
    for scan_key in set(duplicates).difference(outstanding.union(keys)):
        raw = dupq.pop(dup_prefix + scan_key, blocking=False)
        if not raw:
            logger.warning("Init: Couldn't pop off dup queue (%s)", scan_key)
            dupq.delete(dup_prefix + scan_key)
            continue

        reinsert(datastore, " (missed duplicate)", Notice(raw), logger)

    while True:
        res = completeq.pop(blocking=False)
        if not res:
            break

        scan_key = completed(Task(res))
        try:
            missing.remove(scan_key)
        except:  # pylint: disable=W0702
            pass

    # Process the set of submissions middleman thinks are in flight but
    # according to Riak are not incomplete.
    for scan_key in missing:
        raw = scanning.pop(scan_key)
        if raw:
            reinsert(datastore, '', Notice(raw), logger, retry_all=False)

    # Set up time outs for all inflight submissions.
    expiry_time = now(max_time)
    for scan_key in inflight:
        # No need to lock. We're the only thing running at this point.
        timeouts.append(Timeout(scan_key, expiry_time))

    signal.signal(signal.SIGINT, interrupt)
    signal.signal(signal.SIGTERM, interrupt)

    datastore.close()
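# Illustrative sketch only: the set algebra init() relies on to reconcile
# Riak's view of in-progress submissions with middleman's scanning table.
# The example keys below are made up; only the intersection/difference logic
# is taken from the function above.
def _example_reconcile(outstanding, keys):
    inflight = outstanding & keys        # both sides agree these are running
    riak_only = outstanding - inflight   # Riak says incomplete, middleman unaware
    middleman_only = keys - inflight     # middleman thinks in flight, Riak disagrees
    return inflight, riak_only, middleman_only

# _example_reconcile({'a', 'b', 'c'}, {'b', 'c', 'd'})
# -> ({'b', 'c'}, {'a'}, {'d'})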
def _do_work(self, work):
    tasks = [Task(raw) for raw in work]
    # noinspection PyProtectedMember
    self.service._handle_task_batch(tasks)
logger.info("Monitoring the following service queues: %s", threshold) while True: queue_lengths = get_service_queue_lengths() over = { k: v for k, v in queue_lengths.iteritems() if v > (threshold.get(k, 0) or v) } for name, size in over.iteritems(): excess = size - threshold.get(name, size) if excess <= 0: continue for msg in get_queue(name).unpush(excess): # noinspection PyBroadException try: t = Task(msg) t.watermark(name, '') t.nonrecoverable_failure('Service busy.') t.cache_key = store.save_error(name, None, None, t) dispatch_queue.send_raw(t.as_dispatcher_response()) logger.info("%s is too busy to process %s.", name, t.srl) except: # pylint:disable=W0702 logger.exception('Problem sending response:') time.sleep(config.system.update_interval)