def build_fake_DB(hosts=10, seed=random.randint(0, 10000), database_file=None):
    hostnames_set = set()
    filePaths_dict = defaultdict(int)
    filePaths_dict_ID = 0
    filePaths_dict_ID_skip = 0
    random.seed(seed)
    fake.seed(seed)
    fake_ES.seed(seed)

    if database_file is None:
        # Get temp db name for the test
        tempdb = tempfile.NamedTemporaryFile(suffix='.db', prefix='testCase', dir=tempfile.gettempdir())
        tempdb.close()
        database_file = tempdb.name

    if os.path.isfile(database_file):
        logger.warning("Adding hosts to existing database")
        with appDB.DBClass(database_file, "False", settings.__version__) as DB:
            conn = DB.appConnectDB()
            # Load existing hosts
            data = DB.Query("SELECT HostName FROM Hosts")
            for hostName in data:
                hostnames_set.add(hostName[0])
            # Load existing paths
            data = DB.Query("SELECT FilePathID, FilePath FROM FilePaths")
            for filePathID, FilePath in data:
                filePaths_dict[FilePath] = filePathID
                filePaths_dict_ID += 1
            filePaths_dict_ID_skip = filePaths_dict_ID
    else:
        with appDB.DBClass(database_file, "True", settings.__version__) as DB:
            DB.appInitDB()
            DB.appSetIndex()
            conn = DB.appConnectDB()
            DB.appRequireIndexesDB("index_EntriesHostName",
                                   "CREATE INDEX index_EntriesHostName on Hosts(HostName)")
            DB.appRequireIndexesDB("index_FilePathsFilePath",
                                   "CREATE INDEX index_FilePathsFilePath on FilePaths(FilePath)")

    with appDB.DBClass(database_file, "False", settings.__version__) as DB:
        conn = DB.appConnectDB()

        # Start creating hosts and data:
        rowList = []
        insertList = []
        numFields = 29 - 3
        valuesQuery = "(NULL," + "?," * numFields + "0, 0)"
        progressCurrent = 0
        progressTotal = hosts
        for i in xrange(0, hosts):
            progressCurrent += 1
            update_progress(float(progressCurrent) / float(progressTotal))

            HostName = ""
            while True:
                HostName = strip_accents((fake_ES.color_name() + fake_ES.country()).replace(' ', ''))
                HostName = strip_non_ascii(HostName)
                HostName += "_" + str(random.randint(0, 999))
                if HostName not in hostnames_set:
                    hostnames_set.add(HostName)
                    break
            print "Creating appcompat/amcache data for host: %s" % HostName

            Instances = ['dummy']
            InstancesCounter = 1
            Recon = 0
            ReconScoring = 0
            DB.ExecuteMany("INSERT INTO Hosts VALUES (NULL,?,?,?,?,?)",
                           [(HostName, str(repr(Instances)), InstancesCounter, Recon, ReconScoring)])
            HostID = DB.Query("SELECT HostID FROM Hosts WHERE HostName = '%s'" % HostName)[0][0]

            # Sampled 2K hosts, this should statistically provide a somewhat realistic amount of entries (for AppCompat)
            for i in xrange(1, random.randint(400, 800)):
                # EntryType = random.choice([settings.__APPCOMPAT__, settings.__AMCACHE__])
                EntryType = settings.__APPCOMPAT__
                RowNumber = 0
                LastModified = str(fake.date_time_between('-1y')) + "." + str(random.randint(1, 9999))
                LastUpdate = str(fake.date_time_between('-4y')) + "." + str(random.randint(1, 9999))
                filePathID = 0
                # todo: FilePath retains final backslash on root paths (c:\, d:\ ...) remove.
                FilePath, FileName = ntpath.split(fake.path())
                FilePath = FilePath.lower()
                FileName = FileName.lower()
                Size = random.randint(1, 100000)
                if EntryType == settings.__APPCOMPAT__:
                    ExecFlag = random.choice(['True', 'False'])
                else:
                    ExecFlag = 'True'
                if EntryType == settings.__AMCACHE__:
                    SHA1 = fake.sha1()
                    FileDescription = random.choice(['', '', '', '', '', '', '', '', '', '', fake.text()])
                    FirstRun = str(fake.date_time_between('-1y')) + "." + str(random.randint(1, 9999))
                    Created = str(fake.date_time_between('-5y')) + "." + str(random.randint(1, 9999))
                    Modified1 = str(fake.date_time_between('-5y')) + "." + str(random.randint(1, 9999))
                    Modified2 = str(fake.date_time_between('-5y')) + "." + str(random.randint(1, 9999))
                    LinkerTS = str(fake.date_time_between('-10y'))
                    Company = fake.company()
                    PE_sizeofimage = random.randint(1, 10000)
                    # Redo re-assignment of date we do on load for AmCache
                    LastUpdate = FirstRun
                    LastModified = Modified2
                else:
                    SHA1 = ''
                    FileDescription = ''
                    FirstRun = ''
                    Created = ''
                    Modified1 = ''
                    Modified2 = ''
                    LinkerTS = ''
                    Company = ''
                    PE_sizeofimage = ''
                Product = 0
                Version_number = 0
                Version = 0
                Language = 0
                Header_hash = 0
                PE_checksum = 0
                SwitchBackContext = 0
                InstanceID = 0

                # # Add FilePath if not there yet
                # DB.Execute("INSERT OR IGNORE INTO FilePaths VALUES (NULL, '%s')" % FilePath)
                # # Get FilePathID
                # FilePathID = DB.QueryInt("SELECT FilePathID FROM FilePaths WHERE FilePath = '%s'" % FilePath)
                if FilePath not in filePaths_dict:
                    filePaths_dict[FilePath] = filePaths_dict_ID
                    filePathID = filePaths_dict_ID
                    filePaths_dict_ID += 1
                else:
                    filePathID = filePaths_dict[FilePath]

                insertList.append((HostID, EntryType, RowNumber, LastModified, LastUpdate, filePathID, FileName,
                                   Size, ExecFlag, SHA1, FileDescription, FirstRun, Created, Modified1, Modified2,
                                   LinkerTS, Product, Company, PE_sizeofimage, Version_number, Version, Language,
                                   Header_hash, PE_checksum, SwitchBackContext, InstanceID))

                # Dump every now and then:
                if len(insertList) > 1000000:
                    logger.info("Dumping data to DB")
                    DB.ExecuteMany("INSERT INTO Entries VALUES " + valuesQuery, insertList)
                    insertList = []

        # Insert last bucket
        logger.info("Dumping last bucket to DB")
        DB.ExecuteMany("INSERT INTO Entries VALUES " + valuesQuery, insertList)
        # Insert new FilePaths
        list_FilePath_ID = [(v, k) for k, v in filePaths_dict.items()]
        list_FilePath_ID.sort(key=lambda tup: tup[0])
        DB.ExecuteMany("INSERT INTO FilePaths VALUES (?,?)", list_FilePath_ID[filePaths_dict_ID_skip:])

    return database_file
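

# Usage sketch (not part of the original module): builds a small throwaway database
# with fake AppCompat data and prints the resulting row counts. The host count and
# seed are arbitrary illustration values; the queries only touch the Hosts and
# Entries tables populated by build_fake_DB() above.
def _example_build_fake_DB():
    test_db = build_fake_DB(hosts=5, seed=1234)
    with appDB.DBClass(test_db, "False", settings.__version__) as DB:
        DB.appConnectDB()
        host_count = DB.Query("SELECT count(*) FROM Hosts")[0][0]
        entry_count = DB.Query("SELECT count(*) FROM Entries")[0][0]
        print "Fake DB %s: %d hosts / %d entries" % (test_db, host_count, entry_count)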
def appLoadMP(pathToLoad, dbfilenameFullPath, maxCores, governorOffFlag):
    global _tasksPerJob

    files_to_process = []
    conn = None

    # Start timer
    t0 = datetime.now()

    logger.debug("Starting appLoadMP")
    # Calculate aggregate file_filter for all ingest types supported:
    file_filter = '|'.join([v.getFileNameFilter() for k, v in ingest_plugins.iteritems()])
    # Add zip extension
    file_filter += "|.*\.zip"

    # Check if we're loading Redline data
    if os.path.isdir(pathToLoad) and os.path.basename(pathToLoad).lower() == 'RedlineAudits'.lower():
        files_to_process = searchRedLineAudits(pathToLoad)
    else:
        # Search for all files to be processed
        if os.path.isdir(pathToLoad):
            files_to_process = searchFolders(pathToLoad, file_filter)
        else:
            files_to_process = processArchives(pathToLoad, file_filter)

    if files_to_process:
        # Init DB if required
        DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
        conn = DB.appConnectDB()

        # Extract hostnames, grab existing host IDs from DB and calculate instance ID for new IDs to be ingested:
        instancesToProcess = []
        instancesToProcess += GetIDForHosts(files_to_process, DB)
        countInstancesToProcess = len(instancesToProcess)
        logger.info("Found %d new instances" % (countInstancesToProcess))

        # Setup producers/consumers initial counts
        num_consumers = 1
        num_producers = 1

        # Setup MPEngine
        mpe = MPEngineProdCons(maxCores, appLoadProd, appLoadCons, governorOffFlag)

        # Reduce _tasksPerJob for small jobs
        if countInstancesToProcess < _tasksPerJob:
            _tasksPerJob = 1

        # Create task list
        task_list = []
        instancesPerJob = _tasksPerJob
        num_tasks = 0
        for chunk in chunks(instancesToProcess, instancesPerJob):
            # todo: We no longer need pathToLoad as tasks include the fullpath now
            task_list.append(Task(pathToLoad, chunk))
            num_tasks += 1

        if num_tasks > 0:
            # Check if we have to drop indexes to speed up insertions
            # todo: Research the ratio of existing hosts to new hosts where this makes sense
            if countInstancesToProcess > 1000 or DB.CountHosts() < 1000:
                DB.appDropIndexesDB()

            # Queue tasks for Producers
            mpe.addTaskList(task_list)

            # Start procs
            mpe.startProducers(num_producers)
            mpe.startConsumers(num_consumers, [dbfilenameFullPath])
            # mpe.addProducer()

            # Control loop
            while mpe.working():
                time.sleep(1.0)
                (num_producers, num_consumers, num_tasks, progress_producers, progress_consumers) = mpe.getProgress()
                elapsed_time = datetime.now() - t0
                mean_loadtime_per_host = (elapsed_time) / max(1, _tasksPerJob * progress_consumers)
                pending_hosts = ((num_tasks * _tasksPerJob) - (_tasksPerJob * progress_consumers))
                etr = (mean_loadtime_per_host * pending_hosts)
                eta = t0 + elapsed_time + etr
                ett = (eta - t0)
                if settings.logger_getDebugMode():
                    status_extra_data = " Prod: %s Cons: %s (%d -> %d -> %d: %d) [RAM: %d%% / Obj: %d / ETH: %s / ETA: %s / ETT: %s]" % \
                        (num_producers, num_consumers, num_tasks, progress_producers, progress_consumers,
                         progress_producers - progress_consumers, psutil_phymem_usage(), len(gc.get_objects()),
                         mean_loadtime_per_host if progress_consumers * _tasksPerJob > 100 else "N/A",
                         str(eta.time()).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A",
                         str(ett).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A")
                else:
                    status_extra_data = ""
                # logger.info("Parsing files%s" % status_extra_data)
                logger.info(update_progress(min(1, float(progress_consumers) / float(num_tasks)),
                                            "Parsing files%s" % status_extra_data, True))
                mpe.rebalance()

            del mpe

        # Stop timer
        elapsed_time = datetime.now() - t0
        mean_loadtime_per_host = (elapsed_time) / max(1, countInstancesToProcess)
        logger.info("Load speed: %s seconds / file" % (mean_loadtime_per_host))
        logger.info("Load time: %s" % (str(elapsed_time).split(".")[0]))
    else:
        logger.info("Found no files to process!")
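

# Usage sketch (not part of the original module): loads every supported artifact
# found under a directory into a database file. The paths, core count and
# governor flag are illustrative placeholders; appLoadMP() itself resolves the
# ingest plugins and runs the producer/consumer engine.
def _example_appLoadMP():
    appLoadMP(pathToLoad="/cases/hostdata",
              dbfilenameFullPath="/cases/appcompat.db",
              maxCores=4,
              governorOffFlag=False)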
def appSearchMP(dbfilenameFullPath, searchType, search_space, options):
    (outputFile, maxCores) = (options.outputFile, options.maxCores)
    known_bad_data = None

    # Start timer
    t0 = time.time()

    DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
    conn = DB.appConnectDB()

    # If possible use the available indexes
    if hasattr(options, 'field_name') and searchType == 'LITERAL' and \
            options.searchLiteral[0][0] not in ['=', '>', '<'] and DB.appIndexExistsDB(options.field_name):
        # namedtuple classes are used as lightweight .value holders so the indexed path
        # can share the reporting code below with the multiprocessing.Value counters
        # used by the full search path.
        num_hits = namedtuple('hits', 'value')
        num_hits_suppressed = namedtuple('hits', 'value')
        (num_hits.value, num_hits_suppressed.value, results) = runIndexedSearch(dbfilenameFullPath, search_space, options)
    else:
        # Get total number of entries to search
        entriesCount = DB.CountEntries()
        logger.debug("Total entries in search space: %d" % entriesCount)

        # Pre-load known_bad if required
        if searchType == 'KNOWNBAD':
            known_bad_data = LoadRegexBulkSearch(options.knownbad_file)

        # Establish communication queues
        tasks = multiprocessing.JoinableQueue()
        resultsProducers = multiprocessing.Queue()
        resultsConsumers = multiprocessing.Queue()
        hitHistogram_queue = multiprocessing.Queue()

        # Start producers/consumers
        num_consumers = 1
        num_producers = max(1, maxCores - 1)

        # Prep lock for progress update Producers
        progProducers = multiprocessing.Value('i', 0)
        # Prep lock for progress update Consumers
        progConsumers = multiprocessing.Value('i', 0)
        # Prep Consumers return values
        num_hits = multiprocessing.Value('i', 0)
        num_hits_suppressed = multiprocessing.Value('i', 0)

        logger.debug('Using %d cores for searching / %d cores for dumping results' % (num_producers, num_consumers))

        # Queue tasks for Producers
        # Limit rowsPerJob to constrain memory use and ensure reasonable progress updates
        rowsPerJob = min((entriesCount / 8), 5000)
        logger.debug("RowsPerJob: %d" % rowsPerJob)
        num_tasks = 0
        for startingRowID in range(0, entriesCount - rowsPerJob, rowsPerJob):
            tasks.put(Task(startingRowID, rowsPerJob - 1))
            logger.debug("Creating search job %d: [%d - %d]" %
                         (num_tasks, startingRowID, startingRowID + rowsPerJob - 1))
            num_tasks += 1
        logger.debug("Creating search job %d: [%d - %d]" %
                     (num_tasks, num_tasks * (rowsPerJob),
                      ((num_tasks * rowsPerJob) + (entriesCount - (num_tasks * (rowsPerJob) - 1)))))
        # Special consideration for the last one:
        tasks.put(Task(num_tasks * (rowsPerJob), (entriesCount - ((num_tasks * rowsPerJob) - 1))))
        logger.debug("Number of tasks: %d" % num_tasks)

        # Add a poison pill for each producer
        for i in xrange(num_producers):
            tasks.put(None)

        # Start producer threads
        producers = [Producer(tasks, resultsProducers, dbfilenameFullPath, progProducers, num_consumers,
                              searchType, search_space, options, num_hits, known_bad_data)
                     for i in xrange(num_producers)]
        for producer in producers:
            producer.daemon = True  # Remove for debugging
            producer.start()

        # Start consumer threads
        consumers = [Consumer(resultsProducers, resultsConsumers, progConsumers, num_producers, outputFile,
                              dbfilenameFullPath, searchType, search_space, options, num_hits,
                              num_hits_suppressed, hitHistogram_queue, known_bad_data)
                     for i in xrange(num_consumers)]
        for consumer in consumers:
            consumer.daemon = True  # Remove for debugging
            consumer.start()

        # Producer progress loop
        while (num_tasks > progProducers.value and progProducers.value >= 0):
            logger.debug("Producer num_tasks: %d - v.value: %d" % (num_tasks, progProducers.value))
            update_progress(min(1, float(progProducers.value) / float(num_tasks)),
                            "Searching [%d]" % (num_hits.value - num_hits_suppressed.value))
            time.sleep(0.5)
        update_progress(1, "Searching [%d]" % (num_hits.value - num_hits_suppressed.value))

        # Wait for consumers dumping results to finish too
        while (num_hits.value > progConsumers.value and progConsumers.value >= 0):
            logger.debug("Consuming hit: %d / %d" % (progConsumers.value, num_hits.value))
            update_progress(min(1, float(progConsumers.value) / float(num_hits.value)),
                            "Dumping results to disk [%d]" % progConsumers.value)
            time.sleep(0.5)

        # Make sure we dumped as many hits as we found
        assert (num_hits.value == progConsumers.value)
        update_progress(1, "Dumping results to disk [%d]" % progConsumers.value)

        # Track Consumers deaths
        logger.debug("Waiting for consumer reverse-poison pills")
        while num_consumers > 0:
            tmp = resultsConsumers.get()
            # Check for reverse-poison pill
            if tmp is None:
                num_consumers -= 1
                logger.debug("Consumer finished!")
        logger.debug("All consumers accounted for")

        # Wait for consumer threads to finish
        logger.debug("Waiting for consumer threads to finish")
        for consumer in consumers:
            consumer.join()
        logger.debug("Consumer threads finished")

        # Print hit histogram:
        results = []
        results.append(('cyan', ("Hit histogram:", "", "")))
        while not hitHistogram_queue.empty():
            (name, regex, regex_hits) = hitHistogram_queue.get()
            results.append(('white', (name, regex, regex_hits)))
        if len(results) > 1:
            outputcolum(results)

    # Stop timer
    t1 = time.time()

    logger.info("Search hits: %d" % num_hits.value)
    logger.info("Suppressed duplicate hits: %d" % num_hits_suppressed.value)
    logger.info("Search time: %s" % (str(timedelta(seconds=(t1 - t0)))))

    if num_hits.value:
        logger.info("Head:")
        # Dump head of output file:
        num_lines = file_size(options.outputFile)
        from itertools import islice
        with open(options.outputFile) as myfile:
            head = list(islice(myfile, 5))
        for line in head:
            logger.info(line.strip('\n\r'))
        logger.info("(%d lines suppressed)" % max(0, (num_lines - 5)))

    return (num_hits.value, num_hits_suppressed.value, results)
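

# Usage sketch (not part of the original module): runs a LITERAL search against an
# existing database. The options object is normally built by the CLI layer; here a
# bare class fakes just the attributes this function reads on the non-indexed path
# (outputFile, maxCores, searchLiteral). The Producer/Consumer workers may consume
# additional attributes in the full application, and the search_space value below
# is only an illustrative placeholder forwarded to them.
def _example_appSearchMP():
    class _Options(object):
        outputFile = "Output.txt"
        maxCores = 4
        searchLiteral = ["calc.exe"]

    hits, suppressed, results = appSearchMP("/cases/appcompat.db", "LITERAL", "FileName", _Options())
    print "Hits: %d (suppressed duplicates: %d)" % (hits, suppressed)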
def GetIDForHosts(fileFullPathList, DB):
    # Returns: (filePath, instanceID, hostname, hostID, ingest_type)
    hostsTest = {}
    hostsProcess = []
    progress_total = 0
    progress_current = 0

    # Determine plugin_type and hostname
    for file_name_fullpath in fileFullPathList:
        hostName = None
        ingest_type = None
        loop_counter = 0
        while True:
            if loop_counter > len(ingest_plugins_types_stack):
                # We ignore empty files from hosts with no appcompat data
                # todo: Omit suppression on verbose mode
                tmp_file_size = file_size(file_name_fullpath)
                if tmp_file_size > 500:
                    logger.warning("No ingest plugin could process: %s (skipping file) [size: %d]" %
                                   (ntpath.basename(file_name_fullpath), tmp_file_size))
                break
            ingest_type = ingest_plugins_types_stack[0]
            if ingest_plugins[ingest_type].matchFileNameFilter(file_name_fullpath):
                # Check magic:
                try:
                    magic_check = ingest_plugins[ingest_type].checkMagic(file_name_fullpath)
                    if isinstance(magic_check, tuple):
                        logger.error("Report bug")
                    else:
                        magic_check_res = magic_check
                        if magic_check_res:
                            # Magic OK, go with this plugin
                            hostName = ingest_plugins[ingest_type].getHostName(file_name_fullpath)
                            break
                except Exception as e:
                    logger.exception("Error processing: %s (%s)" % (file_name_fullpath, str(e)))
            # Emulate stack with list to minimize internal looping (place last used plugin at the top)
            ingest_plugins_types_stack.remove(ingest_type)
            ingest_plugins_types_stack.insert(len(ingest_plugins_types_stack), ingest_type)
            loop_counter += 1
        if hostName is not None:
            if hostName in hostsTest:
                hostsTest[hostName].append((file_name_fullpath, ingest_plugins[ingest_type]))
            else:
                hostsTest[hostName] = []
                hostsTest[hostName].append((file_name_fullpath, ingest_plugins[ingest_type]))

    progress_total = len(hostsTest.keys())
    # Iterate over hosts. If host exists in DB grab rowID else create and grab rowID.
    conn = DB.appGetConn()
    with closing(conn.cursor()) as c:
        for hostName in hostsTest.keys():
            assert(hostName)
            logger.debug("Processing host: %s" % hostName)
            # Check if Host exists
            c.execute("SELECT count(*) FROM Hosts WHERE HostName = '%s'" % hostName)
            data = c.fetchone()[0]
            if (data != 0):
                # Host already has at least one instance in the DB
                c.execute("SELECT HostID, Instances FROM Hosts WHERE HostName = '%s'" % hostName)
                data = c.fetchone()
                tmpHostID = data[0]
                tmpInstances = eval(data[1])
                for (file_fullpath, ingest_plugin) in hostsTest[hostName]:
                    logger.debug("Grabbing instanceID from file: %s" % file_fullpath)
                    try:
                        instance_ID = CalculateInstanceID(file_fullpath, ingest_plugin)
                    except Exception:
                        logger.error("Error parsing: %s (skipping)" % file_fullpath)
                        traceback.print_exc(file=sys.stdout)
                    else:
                        if str(instance_ID) not in tmpInstances:
                            tmpInstances.append(str(instance_ID))
                            hostsProcess.append((file_fullpath, instance_ID, hostName, tmpHostID, ingest_plugin))
                        else:
                            logger.debug("Duplicate host and instance found: %s" % hostName)
                            continue
                # Save updated Instances list
                c.execute("UPDATE Hosts SET Instances = %s, InstancesCounter = %d WHERE HostName = '%s'" %
                          ('"' + str(repr(tmpInstances)) + '"', len(tmpInstances), hostName))
            else:
                # Host does not exist. Add instance and grab the host ID.
                tmpInstances = []
                newInstances = []
                for (file_fullpath, ingest_plugin) in hostsTest[hostName]:
                    try:
                        instance_ID = CalculateInstanceID(file_fullpath, ingest_plugin)
                    except Exception:
                        logger.error("Error parsing: %s (skipping)" % file_fullpath)
                        traceback.print_exc(file=sys.stdout)
                    else:
                        if str(instance_ID) not in tmpInstances:
                            tmpInstances.append(str(instance_ID))
                            newInstances.append((file_fullpath, instance_ID, ingest_plugin))
                c.execute("INSERT INTO Hosts VALUES (NULL,%s,%s,%d,%d,%d)" %
                          ('"' + hostName + '"', '"' + str(repr(tmpInstances)) + '"', len(tmpInstances), 0, 0))
                tmpHostID = c.lastrowid
                for (file_fullpath, instance_ID, ingest_plugin) in newInstances:
                    # todo: Do we want/need each row to track from what instance it came?
                    hostsProcess.append((file_fullpath, instance_ID, hostName, tmpHostID, ingest_plugin))

            # Update progress
            progress_current += 1
            if settings.logger_getDebugMode():
                status_extra_data = " [RAM: %d%%]" % psutil_phymem_usage()
            else:
                status_extra_data = ""
            # logger.debug("Pre-process new hosts/instances%s" % status_extra_data)
            logger.info(update_progress(min(1, float(progress_current) / float(progress_total)),
                                        "Calculate IDs for new hosts/instances%s" % status_extra_data, True))

        conn.commit()

    # Return hosts to be processed
    return hostsProcess
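

# Usage sketch (not part of the original module): resolves host and instance IDs for
# a couple of collected artifact files before ingestion, mirroring how appLoadMP()
# opens the database. The file paths are illustrative placeholders for artifacts
# that exist on disk and match an ingest plugin's filename filter.
def _example_GetIDForHosts():
    DB = appDB.DBClass("/cases/appcompat.db", True, settings.__version__)
    DB.appConnectDB()
    pending = GetIDForHosts(["/cases/hostdata/HOST1.zip", "/cases/hostdata/HOST2.zip"], DB)
    for (file_fullpath, instance_ID, hostName, hostID, ingest_plugin) in pending:
        print "%s -> host %s (HostID %d, instance %s)" % (file_fullpath, hostName, hostID, instance_ID)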