Notice = None  # bound lazily by init_notice on first use


def init_notice(raw, logger):
    global Notice  # pylint: disable=W0603
    if Notice is None:
        from assemblyline.al.common.notice import Notice
    logger.info('Sending alert: %s', str(raw))
    return Notice(raw)
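# A minimal, self-contained sketch of the lazy-import pattern init_notice
# uses: cache the class in a module global so the import cost is paid once,
# on first use, instead of at process start-up. The Parser/JSONDecoder names
# below are illustrative stand-ins, not part of this module.
Parser = None


def _demo_lazy_import(raw):
    global Parser  # pylint: disable=W0603
    if Parser is None:
        # Deferred import, bound to the module global, exactly as
        # init_notice does with Notice.
        from json import JSONDecoder as Parser
    return Parser().decode(raw)

# _demo_lazy_import('{"a": 1}') -> {'a': 1}; subsequent calls skip the import.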
def dropper():  # df node def
    datastore = forge.get_datastore()

    while running:
        raw = dropq.pop(timeout=1)  # df pull pop
        if not raw:
            continue

        notice = Notice(raw)

        send_notification(notice)

        c12n = notice.get('classification', config.core.middleman.classification)
        expiry = now_as_iso(86400)
        sha256 = notice.get('sha256')

        datastore.save_or_freshen_file(sha256, {'sha256': sha256}, expiry, c12n)

    datastore.close()
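# A hedged sketch of dropper's pop-with-timeout worker loop, using a local
# Queue in place of the Redis-backed dropq so it can run standalone. The
# bounded wait is what lets the loop notice a cleared running flag instead
# of blocking forever on an empty queue.
def _demo_drop_loop():
    import Queue as queue_mod  # Python 2; on Python 3 use `import queue`
    work = queue_mod.Queue()
    work.put({'sha256': '0' * 64})
    running = [True]

    while running[0]:
        try:
            item = work.get(timeout=1)
        except queue_mod.Empty:
            continue  # nothing to do; re-check the running flag
        print('dropped: %s' % item)
        running[0] = False  # stop after one item for the demo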
def retry(raw, scan_key, sha256, ex):  # df node def
    current_time = now()

    notice = Notice(raw)
    retries = notice.get('retries', 0) + 1

    if retries > max_retries:
        trace = ''
        if ex and type(ex) != FileStoreException:
            trace = ': ' + get_stacktrace_info(ex)
        logger.error('Max retries exceeded for %s%s', sha256, trace)
        dupq.delete(dup_prefix + scan_key)
    elif expired(current_time - seconds(notice.get('ts', current_time)), 0):
        logger.info('No point retrying expired submission for %s', sha256)
        dupq.delete(dup_prefix + scan_key)  # df pull delete
    else:
        logger.info('Requeuing %s (%s)', sha256, ex or 'unknown')
        notice.set('retries', retries)
        notice.set('retry_at', now(retry_delay))

        retryq.push(notice.raw)  # df push push
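# A standalone sketch of the three-way retry decision above, with the
# expiry check reduced to plain wall-clock arithmetic. The max_retries and
# four-hour TTL values are assumptions for illustration, not the deployed
# configuration.
def _demo_retry_decision(retries, submitted_ts, current_time,
                         max_retries=3, ttl=4 * 60 * 60):
    if retries + 1 > max_retries:
        return 'give_up'   # delete the dup queue, log an error
    if current_time - submitted_ts > ttl:
        return 'expired'   # too old to be worth retrying
    return 'requeue'       # bump retries, set retry_at, push to retryq

assert _demo_retry_decision(0, 100, 200) == 'requeue'
assert _demo_retry_decision(3, 100, 200) == 'give_up'
assert _demo_retry_decision(0, 0, 5 * 60 * 60) == 'expired'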
def init():
    datastore = forge.get_datastore()
    datastore.commit_index('submission')

    sids = [
        x['submission.sid'] for x in datastore.stream_search(
            'submission',
            'state:submitted AND times.submitted:[NOW-1DAY TO *] '
            'AND submission.metadata.type:* '
            'AND NOT submission.description:Resubmit*')
    ]

    submissions = {}
    submitted = {}
    for submission in datastore.get_submissions(sids):
        task = Task(submission)

        if not task.original_selected or not task.root_sha256 or not task.scan_key:
            continue

        if forge.determine_ingest_queue(task.root_sha256) != ingestq_name:
            continue

        scan_key = task.scan_key
        submissions[task.sid] = submission
        submitted[scan_key] = task.sid

    # Outstanding is the set of things Riak believes are being scanned.
    outstanding = set(submitted.keys())

    # Keys is the set of things middleman believes are being scanned.
    keys = set(scanning.keys())

    # Inflight is the set of submissions middleman and Riak agree are inflight.
    inflight = outstanding.intersection(keys)

    # Missing is the set of submissions middleman thinks are in flight but
    # according to Riak are not incomplete.
    missing = keys.difference(inflight)

    # Process the set of submissions Riak believes are incomplete but
    # middleman doesn't know about.
    for scan_key in outstanding.difference(inflight):
        sid = submitted.get(scan_key, None)
        if not sid:
            logger.info("Init: No sid found for incomplete submission: %s",
                        scan_key)
            continue

        submission = submissions[sid]
        task = Task(submission)

        if not task.original_selected or not task.root_sha256 or not task.scan_key:
            logger.info("Init: Not root_sha256 or original_selected")
            continue

        if not task.metadata:
            logger.info("Init: Incomplete submission is not one of ours: %s", sid)
            continue

        stype = None
        try:
            stype = task.metadata.get('type', None)
        except:  # pylint: disable=W0702
            logger.exception(
                "Init: Incomplete submission has malformed metadata: %s", sid)

        if not stype:
            logger.info("Init: Incomplete submission missing type: %s", sid)
            continue

        raw = {
            'metadata': task.metadata,
            'overrides': get_submission_overrides(task, overrides),
            'sha256': task.root_sha256,
            'type': stype,
        }
        raw['overrides']['selected'] = task.original_selected

        reinsert(datastore, " (incomplete)", Notice(raw), logger)

    r = redis.StrictRedis(persistent['host'],
                          persistent['port'],
                          persistent['db'])

    # Duplicates is the set of sha256s where a duplicate queue exists.
    duplicates = [
        x.replace(dup_prefix, '', 1) for x in r.keys(dup_prefix + '*')
    ]

    # Process the set of duplicates where no scanning or riak entry exists.
    for scan_key in set(duplicates).difference(outstanding.union(keys)):
        raw = dupq.pop(dup_prefix + scan_key, blocking=False)
        if not raw:
            logger.warning("Init: Couldn't pop off dup queue (%s)", scan_key)
            dupq.delete(dup_prefix + scan_key)
            continue

        reinsert(datastore, " (missed duplicate)", Notice(raw), logger)

    while True:
        res = completeq.pop(blocking=False)
        if not res:
            break

        scan_key = completed(Task(res))
        try:
            missing.remove(scan_key)
        except:  # pylint: disable=W0702
            pass

    # Process the set of submissions middleman thinks are in flight but
    # according to Riak are not incomplete.
    for scan_key in missing:
        raw = scanning.pop(scan_key)
        if raw:
            reinsert(datastore, '', Notice(raw), logger, retry_all=False)

    # Set up time outs for all inflight submissions.
    expiry_time = now(max_time)
    for scan_key in inflight:
        # No need to lock. We're the only thing running at this point.
        timeouts.append(Timeout(scan_key, expiry_time))

    signal.signal(signal.SIGINT, interrupt)
    signal.signal(signal.SIGTERM, interrupt)

    datastore.close()
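# The heart of init() is the set reconciliation between the two views of
# "in flight": what Riak says is incomplete versus what middleman's
# scanning table says. A small worked example with made-up scan keys:
def _demo_reconcile():
    outstanding = {'a', 'b', 'c'}       # Riak: incomplete submissions
    keys = {'b', 'c', 'd'}              # middleman: scanning table

    inflight = outstanding & keys       # both agree; just set up timeouts
    riak_only = outstanding - inflight  # reinsert as " (incomplete)"
    missing = keys - inflight           # pop from scanning and reinsert

    return inflight, riak_only, missing

assert _demo_reconcile() == ({'b', 'c'}, {'a'}, {'d'})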
def ingest(datastore, user_groups, raw):  # df node def
    notice = Notice(raw)

    ignore_size = notice.get('ignore_size', False)
    never_drop = notice.get('never_drop', False)
    sha256 = notice.get('sha256')
    size = notice.get('size', 0)

    # Make sure we have a submitter ...
    user = notice.get('submitter', None)
    if user is None:
        user = config.submissions.user
        notice.set('submitter', user)

    # ... and groups.
    groups = notice.get('groups', None)
    if groups is None:
        groups = user_groups.get(user, None)
        if groups is None:
            ruser = datastore.get_user(user)
            if not ruser:
                return
            groups = ruser.get('groups', [])
            user_groups[user] = groups
        notice.set('groups', groups)

    selected = notice.get('selected', None)
    if not selected:
        selected = selected_initial
        notice.set('selected', selected)
        notice.set('resubmit_to', ['Dynamic Analysis'])

    resubmit_to = notice.get('resubmit_to', None)
    if resubmit_to is None:
        notice.set('resubmit_to', [])

    ingester_counts.increment('ingest.bytes_ingested', int(size))
    ingester_counts.increment('ingest.submissions_ingested')

    if not sha256:
        send_notification(notice, failure="Invalid sha256",
                          logfunc=logger.warning)
        return

    c12n = notice.get('classification', '')
    if not Classification.is_valid(c12n):
        send_notification(notice, failure="Invalid classification %s" % c12n,
                          logfunc=logger.warning)
        return

    metadata = notice.get('metadata', {})
    if isinstance(metadata, dict):
        to_delete = []
        for k, v in metadata.iteritems():
            # Use a distinct name so we don't clobber the file size above.
            value_size = sys.getsizeof(v, -1)
            if isinstance(v, basestring):
                value_size = len(v)
            if value_size > config.core.middleman.max_value_size:
                to_delete.append(k)
            elif value_size < 0:
                to_delete.append(k)
        if to_delete:
            logger.info('Removing %s from %s', to_delete, notice.raw)
            for k in to_delete:
                metadata.pop(k, None)

    if size > config.submissions.max.size and not ignore_size and not never_drop:
        notice.set(
            'failure',
            "File too large (%d > %d)" % (size, config.submissions.max.size))
        dropq.push(notice.raw)  # df push push
        ingester_counts.increment('ingest.skipped')
        return

    pprevious, previous, score = None, False, None
    if not notice.get('ignore_cache', False):
        pprevious, previous, score, _ = check(datastore, notice)

    # Assign priority.
    low_priority = is_low_priority(notice)

    priority = notice.get('priority')
    if priority is None:
        priority = priority_value['medium']

        if score is not None:
            priority = priority_value['low']
            for level in ('critical', 'high'):
                if score >= threshold_value[level]:
                    priority = priority_value[level]
                    break
        elif low_priority:
            priority = priority_value['low']

    # Reduce the priority by an order of magnitude for very old files.
    current_time = now()
    if priority and \
            expired(current_time - seconds(notice.get('ts', current_time)), 0):
        priority = (priority / 10) or 1

    notice.set('priority', priority)

    # Do this after priority has been assigned.
    # (So we don't end up dropping the resubmission).
    if previous:
        ingester_counts.increment('ingest.duplicates')
        finalize(pprevious, previous, score, notice)  # df push calls
        return

    if drop(notice):  # df push calls
        return

    if is_whitelisted(notice):  # df push calls
        return

    uniqueq.push(priority, notice.raw)  # df push push
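# A self-contained sketch of the priority assignment above. The priority
# and threshold tables here are illustrative values, not the deployed
# priority_value/threshold_value config: an explicit priority always wins,
# then a cached score picks the band, then the low-priority flag applies.
def _demo_assign_priority(score, low_priority=False, explicit=None):
    priority_value = {'low': 100, 'medium': 200, 'high': 300, 'critical': 400}
    threshold_value = {'high': 500, 'critical': 1000}

    if explicit is not None:
        return explicit
    priority = priority_value['medium']
    if score is not None:
        priority = priority_value['low']
        for level in ('critical', 'high'):
            if score >= threshold_value[level]:
                priority = priority_value[level]
                break
    elif low_priority:
        priority = priority_value['low']
    return priority

assert _demo_assign_priority(None) == 200
assert _demo_assign_priority(600) == 300
assert _demo_assign_priority(1200) == 400
assert _demo_assign_priority(None, low_priority=True) == 100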
def completed(task):  # df node def
    sha256 = task.root_sha256

    psid = task.psid
    score = task.score
    sid = task.sid

    scan_key = task.scan_key

    with ScanLock(scan_key):
        # Remove the entry from the hash of submissions in progress.
        raw = scanning.pop(scan_key)  # df pull pop
        if not raw:
            logger.warning("Untracked submission (score=%d) for: %s %s",
                           int(score), sha256, str(task.metadata))

            # Not a result we care about. We are notified for every
            # submission that completes. Some submissions will not be ours.
            if task.metadata:
                stype = None
                try:
                    stype = task.metadata.get('type', None)
                except:  # pylint: disable=W0702
                    logger.exception("Malformed metadata: %s:", sid)

                if not stype:
                    return scan_key

                if (task.description or '').startswith(default_prefix):
                    raw = {
                        'metadata': task.metadata,
                        'overrides': get_submission_overrides(task, overrides),
                        'sha256': sha256,
                        'type': stype,
                    }

                    finalize(psid, sid, score, Notice(raw))
            return scan_key

        errors = task.raw.get('error_count', 0)
        file_count = task.raw.get('file_count', 0)
        ingester_counts.increment('ingest.submissions_completed')
        ingester_counts.increment('ingest.files_completed', file_count)
        ingester_counts.increment('ingest.bytes_completed', int(task.size or 0))

        notice = Notice(raw)

        with cache_lock:
            _add(scan_key, psid, sid, score, errors, now())

        finalize(psid, sid, score, notice)  # df push calls

        def exhaust():
            while True:
                res = dupq.pop(  # df pull pop
                    dup_prefix + scan_key, blocking=False)
                if res is None:
                    break
                yield res

        # You may be tempted to remove the assignment to dups and use the
        # value directly in the for loop below. That would be a mistake.
        # The function finalize may push on the duplicate queue which we
        # are pulling off and so condensing those two lines creates a
        # potential infinite loop.
        dups = [dup for dup in exhaust()]
        for dup in dups:
            finalize(psid, sid, score, Notice(dup))

    return scan_key
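# Why dups is materialized before the loop: if the loop body can push new
# entries onto the queue being drained, iterating the generator directly
# never terminates. A toy reproduction of the hazard the comment above
# describes, made safe by the snapshot:
def _demo_snapshot_before_drain():
    from collections import deque
    q = deque([1, 2, 3])

    def exhaust():
        while q:
            yield q.popleft()

    dups = [dup for dup in exhaust()]  # snapshot first ...
    for dup in dups:
        q.append(dup * 10)             # ... so push-backs can't extend the loop
    return dups, list(q)

assert _demo_snapshot_before_drain() == ([1, 2, 3], [10, 20, 30])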
def submitter():  # df node def
    client = forge.get_submission_service()
    datastore = forge.get_datastore()

    while running:
        try:
            raw = submissionq.pop(timeout=1)  # df pull pop
            if not raw:
                continue

            # noinspection PyBroadException
            try:
                sha256 = raw['sha256']
            except Exception:  # pylint: disable=W0703
                logger.exception("Malformed entry on submission queue:")
                continue

            if not sha256:
                logger.error("Malformed entry on submission queue: %s", raw)
                continue

            notice = Notice(raw)
            if drop(notice):  # df push calls
                continue

            if is_whitelisted(notice):  # df push calls
                continue

            # Initialize scan_key as well so a notice with ignore_cache set
            # cannot leave it undefined below.
            pprevious, previous, score, scan_key = None, False, None, None
            if not notice.get('ignore_cache', False):
                pprevious, previous, score, scan_key = check(datastore, notice)

            if previous:
                if not notice.get('resubmit_to', []) and not pprevious:
                    logger.warning(
                        "No psid for what looks like a resubmission of %s: %s",
                        sha256, scan_key)
                finalize(pprevious, previous, score, notice)  # df push calls
                continue

            with ScanLock(scan_key):
                if scanning.exists(scan_key):
                    logger.debug('Duplicate %s', sha256)
                    ingester_counts.increment('ingest.duplicates')
                    dupq.push(dup_prefix + scan_key, notice.raw)  # df push push
                    continue

                scanning.add(scan_key, notice.raw)  # df push add

            ex = return_exception(submit, client, notice)
            if not ex:
                continue

            ingester_counts.increment('ingest.error')

            should_retry = True
            tex = type(ex)
            if tex == FileStoreException:
                ex = tex("Problem with file: %s" % sha256)
            elif tex == CorruptedFileStoreException:
                logger.error("Submission failed due to corrupted "
                             "filestore: %s", ex.message)
                should_retry = False
            else:
                trace = get_stacktrace_info(ex)
                logger.error("Submission failed: %s", trace)

            raw = scanning.pop(scan_key)
            if not raw:
                logger.error('No scanning entry for %s', sha256)
                continue

            if not should_retry:
                continue

            retry(raw, scan_key, sha256, ex)

            if tex == riak.RiakError:
                raise ex  # pylint: disable=E0702
        except Exception:  # pylint:disable=W0703
            logger.exception("Unexpected error")
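# A sketch of the claim-under-lock pattern submitter relies on: the
# scanning.exists/scanning.add pair must be atomic with respect to other
# submitters and to completed(), which is what ScanLock provides. Here a
# plain dict and threading.Lock stand in for the Redis-backed table.
def _demo_try_claim():
    import threading
    scanning_table = {}
    lock = threading.Lock()

    def try_claim(scan_key, payload):
        with lock:
            if scan_key in scanning_table:
                return False  # duplicate: the caller pushes to dupq instead
            scanning_table[scan_key] = payload
            return True

    assert try_claim('k1', {'sha256': 'abc'}) is True
    assert try_claim('k1', {'sha256': 'abc'}) is False  # second claim is a dup

_demo_try_claim()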