예제 #1
def build_fake_DB(hosts=10, seed=random.randint(0, 10000), database_file=None):
    # Fill a database file with randomly generated host rows and per-host
    # entry rows, then return the path of that database file.
    #
    # hosts:         number of fake hosts to generate (bound of the main loop).
    # seed:          defaults to a random integer.
    #                NOTE(review): the default is evaluated once, when the
    #                function is defined, and `seed` is never referenced in the
    #                visible body - confirm whether the seeding code was lost.
    # database_file: path of the database to fill; when None, the name of a
    #                temporary file with a '.db' suffix is used instead.
    #
    # NOTE(review): this copy of the function is missing lines (unterminated
    # calls, loops and conditionals without bodies, orphaned argument lines).
    # The comments below describe only what is visible and flag the gaps.
    hostnames_set = set()
    # Maps FilePath -> FilePathID; filePaths_dict_ID is the next ID to hand out.
    filePaths_dict = defaultdict(int)
    filePaths_dict_ID = 0
    # Set below to the number of paths that were already stored in the database.
    filePaths_dict_ID_skip = 0


    if database_file == None:
        # Get temp db name for the test
        # NOTE(review): the call below is not closed in this copy; its
        # remaining arguments are missing.
        tempdb = tempfile.NamedTemporaryFile(suffix='.db',
        database_file = tempdb.name

    if os.path.isfile(database_file):
        logger.warning("Adding hosts to existing database")
        # NOTE(review): the second argument is the string "False" (and "True"
        # further down), not a boolean - confirm what DBClass expects.
        with appDB.DBClass(database_file, "False", settings.__version__) as DB:
            conn = DB.appConnectDB()
            # Load existing hosts
            data = DB.Query("SELECT HostName FROM Hosts")
            # NOTE(review): the loop body is missing here; presumably it
            # recorded each name in hostnames_set - confirm.
            for hostName in data:
            # Load existing paths
            data = DB.Query("SELECT FilePathID, FilePath FROM FilePaths")
            for filePathID, FilePath in data:
                filePaths_dict[FilePath] = (filePathID)
                filePaths_dict_ID += 1
            filePaths_dict_ID_skip = filePaths_dict_ID

        # NOTE(review): an `else:` for the isfile() check and the calls that
        # wrapped the two CREATE INDEX strings appear to be missing here.
        with appDB.DBClass(database_file, "True", settings.__version__) as DB:
            conn = DB.appConnectDB()
                "CREATE INDEX index_EntriesHostName on Hosts(HostName)")
                "CREATE INDEX index_FilePathsFilePath on FilePaths(FilePath)")

    with appDB.DBClass(database_file, "False", settings.__version__) as DB:
        conn = DB.appConnectDB()

        # Start creating hosts and data:
        rowList = []
        # Buffer of entry rows waiting to be written with ExecuteMany.
        insertList = []
        numFields = 29 - 3
        # VALUES template: one NULL, numFields bound parameters, two literal zeros.
        valuesQuery = "(NULL," + "?," * numFields + "0, 0)"

        progressCurrent = 0
        progressTotal = hosts
        for i in xrange(0, hosts):
            progressCurrent += 1
            update_progress(float(progressCurrent) / float(progressTotal))

            # Build a host name from a fake colour name plus a fake country,
            # reduced to ASCII without spaces, followed by "_" and a random
            # number between 0 and 999.
            HostName = ""
            while True:
                HostName = strip_accents(
                    (fake_ES.color_name() + fake_ES.country()).replace(
                        ' ', ''))
                HostName = strip_non_ascii(HostName)
                HostName += "_" + str(random.randint(000, 999))
                # NOTE(review): the body of this check is missing; presumably
                # it stored the new name and left the loop - confirm.
                if HostName not in hostnames_set:

            print "Creating appcompat/amcache data for host: %s" % HostName
            Instances = ['dummy']
            InstancesCounter = 1
            Recon = 0
            ReconScoring = 0

            # The Instances list is stored as the string form of its repr().
            DB.ExecuteMany("INSERT INTO Hosts VALUES (NULL,?,?,?,?,?)",
                           [(HostName, str(repr(Instances)), InstancesCounter,
                             Recon, ReconScoring)])
            # NOTE(review): this query is cut off after the format operator;
            # the value substituted for %s is not visible.
            HostID = DB.Query(
                "SELECT HostID FROM Hosts WHERE HostName = '%s'" %

            # Sampled 2K hosts, this should statistically provide a somewhat realistic amount of entries (for AppCompat)
            # This inner loop reuses `i`, shadowing the host loop's index.
            for i in xrange(1, random.randint(400, 800)):
                # EntryType = random.choice([settings.__APPCOMPAT__,settings.__AMCACHE__])
                EntryType = settings.__APPCOMPAT__
                RowNumber = 0
                # Timestamps are a fake datetime string plus "." and a random
                # integer between 1 and 9999.
                LastModified = str(fake.date_time_between('-1y')) + "." + str(
                    random.randint(1, 9999))
                LastUpdate = str(fake.date_time_between('-4y')) + "." + str(
                    random.randint(1, 9999))
                filePathID = 0
                # todo: FilePath retains final backslash on root paths (c:\, d:\ ...) remove.
                FilePath, FileName = ntpath.split(fake.path())
                FilePath = FilePath.lower()
                FileName = FileName.lower()
                Size = random.randint(1, 100000)
                # NOTE(review): as visible, the random choice is overwritten
                # by 'True' on the next line; an `else:` may be missing.
                if EntryType == settings.__APPCOMPAT__:
                    ExecFlag = random.choice(['True', 'False'])
                    ExecFlag = 'True'

                # NOTE(review): in this branch the FileDescription choice list
                # is cut off, and the generated values are blanked again at
                # the end of the branch; an `else:` before the empty-string
                # assignments is probably missing - confirm.
                if EntryType == settings.__AMCACHE__:
                    SHA1 = fake.sha1()
                    FileDescription = random.choice(
                        ['', '', '', '', '', '', '', '', '', '',
                    FirstRun = str(fake.date_time_between('-1y')) + "." + str(
                        random.randint(1, 9999))
                    Created = str(fake.date_time_between('-5y')) + "." + str(
                        random.randint(1, 9999))
                    Modified1 = str(fake.date_time_between('-5y')) + "." + str(
                        random.randint(1, 9999))
                    Modified2 = str(fake.date_time_between('-5y')) + "." + str(
                        random.randint(1, 9999))
                    LinkerTS = str(fake.date_time_between('-10y'))
                    Company = fake.company()
                    PE_sizeofimage = random.randint(1, 10000)

                    # Redo re-assignment of date we do on load for AmCache
                    LastUpdate = FirstRun
                    LastModified = Modified2
                    SHA1 = ''
                    FileDescription = ''
                    FirstRun = ''
                    Created = ''
                    Modified1 = ''
                    Modified2 = ''
                    LinkerTS = ''
                    Company = ''
                    PE_sizeofimage = ''

                # Remaining columns are filled with constant zeros.
                Product = 0
                Version_number = 0
                Version = 0
                Language = 0
                Header_hash = 0
                PE_checksum = 0
                SwitchBackContext = 0
                InstanceID = 0

                # # Add FilePath if not there yet
                # DB.Execute("INSERT OR IGNORE INTO FilePaths VALUES (NULL, '%s')" % FilePath)
                # # Get FilePathID
                # FilePathID = DB.QueryInt("SELECT FilePathID FROM FilePaths WHERE FilePath = '%s'" % FilePath)
                # Give unseen paths the next ID from the in-memory map, which
                # replaces the per-row queries commented out above.
                # NOTE(review): an `else:` before the final lookup and the
                # statement that opened the row tuple below (presumably an
                # append to insertList) appear to be missing.
                if FilePath not in filePaths_dict:
                    filePaths_dict[FilePath] = (filePaths_dict_ID)
                    filePathID = filePaths_dict_ID
                    filePaths_dict_ID += 1
                    filePathID = filePaths_dict[FilePath]

                    (HostID, EntryType, RowNumber, LastModified, LastUpdate,
                     filePathID, FileName, Size, ExecFlag, SHA1,
                     FileDescription, FirstRun, Created, Modified1, Modified2,
                     LinkerTS, Product, Company, PE_sizeofimage,
                     Version_number, Version, Language, Header_hash,
                     PE_checksum, SwitchBackContext, InstanceID))

                # Dump every now and then:
                # Flush once more than 1,000,000 rows are buffered.
                # NOTE(review): the ExecuteMany call below is cut off.
                if len(insertList) > 1000000:
                    logger.info("Dumping data to DB")
                    DB.ExecuteMany("INSERT INTO Entries VALUES " + valuesQuery,
                    insertList = []

        # Insert last bucket
        logger.info("Dumping last bucket to DB")
        DB.ExecuteMany("INSERT INTO Entries VALUES " + valuesQuery, insertList)

        # Insert new FilePaths
        # Turn the map into (FilePathID, FilePath) tuples ordered by ID.
        # NOTE(review): the call below is cut off; filePaths_dict_ID_skip is
        # presumably used there to skip paths already stored - confirm.
        list_FilePath_ID = [(v, k) for k, v in filePaths_dict.items()]
        list_FilePath_ID.sort(key=lambda tup: tup[0])
        DB.ExecuteMany("INSERT INTO FilePaths VALUES (?,?)",

    return database_file
예제 #2
def appLoadMP(pathToLoad, dbfilenameFullPath, maxCores, governorOffFlag):
    # Find the files to ingest under pathToLoad, resolve host and instance
    # IDs for them through GetIDForHosts, and load them through an
    # MPEngineProdCons engine while logging progress and timing.
    #
    # pathToLoad:         path searched for input files.
    # dbfilenameFullPath: database path, given to appDB.DBClass and to the
    #                     consumers.
    # maxCores:           forwarded to MPEngineProdCons.
    # governorOffFlag:    forwarded to MPEngineProdCons.
    #
    # No return statement is visible. The module-level _tasksPerJob may be
    # modified (see below).
    #
    # NOTE(review): this copy of the function is missing lines (several
    # `else:` branches, the body of one `if`, and the statement that queues
    # the tasks); the comments flag each gap.
    global _tasksPerJob

    files_to_process = []
    conn = None

    # Start timer
    t0 = datetime.now()

    logger.debug("Starting appLoadMP")
    # Calculate aggregate file_filter for all ingest types supported:
    file_filter = '|'.join([v.getFileNameFilter() for k,v in ingest_plugins.iteritems()])
    # Add zip extension
    file_filter += "|.*\.zip"

    # Check if we're loading Redline data
    # NOTE(review): the `else:` branches are missing below; as visible, the
    # searchFolders result is overwritten by processArchives.
    if os.path.isdir(pathToLoad) and os.path.basename(pathToLoad).lower() == 'RedlineAudits'.lower():
        files_to_process = searchRedLineAudits(pathToLoad)
        # Search for all files to be processed
        if os.path.isdir(pathToLoad):
            files_to_process = searchFolders(pathToLoad, file_filter)
            files_to_process = processArchives(pathToLoad, file_filter)

    if files_to_process:
        # Init DB if required
        DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
        conn = DB.appConnectDB()

        # Extract hostnames, grab existing host IDs from DB and calculate instance ID for new IDs to be ingested:
        instancesToProcess = []
        instancesToProcess += GetIDForHosts(files_to_process, DB)
        countInstancesToProcess = len(instancesToProcess)
        logger.info("Found %d new instances" % (countInstancesToProcess))

        # Setup producers/consumers initial counts
        num_consumers = 1
        num_producers = 1

        # Setup MPEngine
        mpe = MPEngineProdCons(maxCores, appLoadProd, appLoadCons, governorOffFlag)

        # Reduce _tasksPerJob for small jobs
        # _tasksPerJob is a module-level global, so this change outlives the call.
        if countInstancesToProcess < _tasksPerJob: _tasksPerJob = 1

        # Create task list
        task_list = []
        instancesPerJob = _tasksPerJob
        num_tasks = 0
        for chunk in chunks(instancesToProcess, instancesPerJob):
            # todo: We no longer need pathToLoad as tasks include the fullpath now
            task_list.append(Task(pathToLoad, chunk))
            num_tasks += 1

        if num_tasks > 0:
            # Check if we have to drop indexes to speedup insertions
            # todo: Research ratio of existing hosts to new hosts where this makes sense
            # NOTE(review): the body of this `if` and the statement that
            # queues the tasks are missing in this copy.
            if countInstancesToProcess > 1000 or DB.CountHosts() < 1000:

            # Queue tasks for Producers

            # Start procs
            mpe.startConsumers(num_consumers, [dbfilenameFullPath])
            # mpe.addProducer()

            # Control loop
            while mpe.working():
                (num_producers,num_consumers,num_tasks,progress_producers,progress_consumers) = mpe.getProgress()
                # Estimates, extrapolated from the mean time per instance so far:
                #   etr = time remaining, eta = finish time, ett = total time.
                elapsed_time = datetime.now() - t0
                mean_loadtime_per_host = (elapsed_time) / max(1, _tasksPerJob * progress_consumers)
                pending_hosts = ((num_tasks * _tasksPerJob) - (_tasksPerJob * progress_consumers))
                etr = (mean_loadtime_per_host * pending_hosts)
                eta = t0 + elapsed_time + etr
                ett = (eta - t0)
                # In debug mode, extend the status line; the three estimates
                # print "N/A" until _tasksPerJob * progress_consumers exceeds 100.
                if settings.logger_getDebugMode(): status_extra_data = " Prod: %s Cons: %s (%d -> %d -> %d: %d) [RAM: %d%% / Obj: %d / ETH: %s / ETA: %s / ETT: %s]" % \
                                                                       (num_producers, num_consumers, num_tasks, progress_producers, progress_consumers, progress_producers - progress_consumers,
                     psutil_phymem_usage(), len(gc.get_objects()),
                     mean_loadtime_per_host if progress_consumers * _tasksPerJob > 100 else "N/A",
                     str(eta.time()).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A",
                     str(ett).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A")
                else: status_extra_data = ""
                # logger.info("Parsing files%s" % status_extra_data)

                # Progress is the fraction of tasks consumed, capped at 1.
                logger.info(update_progress(min(1,float(progress_consumers) / float(num_tasks)), "Parsing files%s" % status_extra_data, True))

            del mpe

        # Stop timer
        elapsed_time = datetime.now() - t0
        mean_loadtime_per_host = (elapsed_time) / max(1, countInstancesToProcess)
        logger.info("Load speed: %s seconds / file" % (mean_loadtime_per_host))
        logger.info("Load time: %s" % (str(elapsed_time).split(".")[0]))
        # NOTE(review): the message below presumably belonged to a missing
        # `else:` of the `if files_to_process:` check - confirm.
        logger.info("Found no files to process!")
예제 #3
def appSearchMP(dbfilenameFullPath, searchType, search_space, options):
    # Search the database at dbfilenameFullPath and return the tuple
    # (number of hits, number of suppressed hits, results).
    #
    # Two paths are visible: an indexed search through runIndexedSearch, and
    # a multiprocessing pipeline of Producer and Consumer objects that
    # communicate through queues and shared counters.
    #
    # dbfilenameFullPath: database path.
    # searchType:         compared against 'LITERAL' and 'KNOWNBAD' here and
    #                     forwarded to the producers and consumers.
    # search_space:       forwarded unchanged to the search code.
    # options:            object providing outputFile and maxCores, and,
    #                     depending on the path taken, field_name,
    #                     searchLiteral and knownbad_file.
    #
    # NOTE(review): this copy of the function is missing lines (an `else:`
    # after the indexed path, the callee of several orphaned argument lines,
    # and the bodies of several loops); the comments flag each gap.
    (outputFile, maxCores) = (options.outputFile, options.maxCores)
    known_bad_data = None
    # Start timer
    t0 = time.time()

    DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
    conn = DB.appConnectDB()

    # If possible use the available indexes
    # Requires: options has field_name, the search is 'LITERAL', the first
    # character of the first literal is not '=', '>' or '<', and the DB
    # reports an index for that field.
    if hasattr(
            options, 'field_name'
    ) and searchType == 'LITERAL' and options.searchLiteral[0][0] not in [
            '=', '>', '<'
    ] and DB.appIndexExistsDB(options.field_name):
        # The namedtuple classes serve only as holders for a `.value`
        # attribute (set on the class itself), so the code after the search
        # can read num_hits.value as it does for the multiprocessing.Value
        # counters of the other path.
        num_hits = namedtuple('hits', 'value')
        num_hits_suppressed = namedtuple('hits', 'value')
        (num_hits.value, num_hits_suppressed.value,
         results) = runIndexedSearch(dbfilenameFullPath, search_space, options)

        # NOTE(review): an `else:` appears to be missing here; the rest of
        # this indented section is presumably the non-indexed path.
        # Get total number of entries to search
        entriesCount = DB.CountEntries()
        logger.debug("Total entries in search space: %d" % entriesCount)

        # Pre-load known_bad if required
        if searchType == 'KNOWNBAD':
            known_bad_data = LoadRegexBulkSearch(options.knownbad_file)

        # Establish communication queues
        tasks = multiprocessing.JoinableQueue()
        resultsProducers = multiprocessing.Queue()
        resultsConsumers = multiprocessing.Queue()
        hitHistogram_queue = multiprocessing.Queue()

        # Start producers/consumers
        # One consumer; the producers get the remaining cores (at least one).
        num_consumers = 1
        num_producers = max(1, maxCores - 1)

        # Prep lock for progress update Producers
        progProducers = multiprocessing.Value('i', 0)
        # Prep lock for progress update Consumers
        progConsumers = multiprocessing.Value('i', 0)
        # Prep Consumers return values
        num_hits = multiprocessing.Value('i', 0)
        num_hits_suppressed = multiprocessing.Value('i', 0)

        # NOTE(review): the call these two argument lines belonged to
        # (presumably a logger call) is missing.
            'Using %d cores for searching / %d cores for dumping results' %
            (num_producers, num_consumers))

        # Queue tasks for Producers
        # Limit rowsPerJob to constrain memory use and ensure reasonable progress updates
        # NOTE(review): if entriesCount is an int below 8, Python 2 integer
        # division makes rowsPerJob 0 and the range() below would get a zero
        # step - confirm how small search spaces are handled.
        rowsPerJob = min((entriesCount / 8), 5000)
        logger.debug("RowsPerJob: %d" % rowsPerJob)
        num_tasks = 0
        # One task per full slice of rowsPerJob rows; the final slice is
        # handled separately after the loop.
        # NOTE(review): the callee of the orphaned argument lines inside the
        # loop (presumably logger.debug) is missing.
        for startingRowID in range(0, entriesCount - rowsPerJob, rowsPerJob):
            tasks.put(Task(startingRowID, rowsPerJob - 1))
                "Creating search job %d: [%d - %d]" %
                (num_tasks, startingRowID, startingRowID + rowsPerJob - 1))
            num_tasks += 1
        logger.debug("Creating search job %d: [%d - %d]" %
                     (num_tasks, num_tasks * (rowsPerJob),
                      ((num_tasks * rowsPerJob) +
                       (entriesCount - (num_tasks * (rowsPerJob) - 1)))))
        # Special consideration for the last one:
        # NOTE(review): the call wrapping this Task (presumably tasks.put)
        # and any matching num_tasks increment are missing.
            Task(num_tasks * (rowsPerJob),
                 (entriesCount - ((num_tasks * rowsPerJob) - 1))))
        logger.debug("Number of tasks: %d" % num_tasks)

        # Add a poison pill for each producer
        # NOTE(review): the loop body is missing.
        for i in xrange(num_producers):

        # Start producer threads
        producers = [Producer(tasks, resultsProducers, dbfilenameFullPath, progProducers, num_consumers, \
                              searchType, search_space, options, num_hits, known_bad_data) for i in xrange(num_producers)]
        # NOTE(review): only the daemon flag is set here; the statement that
        # starts each producer is not visible.
        for producer in producers:
            producer.daemon = True  # Remove for debugging

        # Start consumer threads
        consumers = [Consumer(resultsProducers, resultsConsumers, progConsumers, num_producers, outputFile, \
                              dbfilenameFullPath, searchType, search_space, options, num_hits, \
                              num_hits_suppressed, hitHistogram_queue, known_bad_data) for i in xrange(num_consumers)]
        # NOTE(review): as above, the statement that starts each consumer is
        # not visible.
        for consumer in consumers:
            consumer.daemon = True  # Remove for debugging

        # Producer progress loop
        # Runs while the producers' counter is below num_tasks and not negative.
        # NOTE(review): the progress-update calls that owned the orphaned
        # argument lines below are missing.
        while (num_tasks > progProducers.value and progProducers.value >= 0):
            logger.debug("Producer num_tasks: %d - v.value: %d" %
                         (num_tasks, progProducers.value))
                    float(progProducers.value) / float(num_tasks)),
                "Searching [%d]" %
                (num_hits.value - num_hits_suppressed.value))
            1, "Searching [%d]" % (num_hits.value - num_hits_suppressed.value))

        # Wait for consumers dumping results to finish too
        # Runs while the consumers' counter is below num_hits and not negative.
        while (num_hits.value > progConsumers.value
               and progConsumers.value >= 0):
            logger.debug("Consuming hit: %d / %d" %
                         (progConsumers.value, num_hits.value))
                    float(progConsumers.value) / float(num_hits.value)),
                "Dumping results to disk [%d]" % progConsumers.value)

        # Make sure we dumped as many hits as we found
        assert (num_hits.value == progConsumers.value)
                        "Dumping results to disk [%d]" % progConsumers.value)

        # Track Consumers deaths
        # Each consumer is counted as finished when a None arrives on
        # resultsConsumers.
        logger.debug("Waiting for consumer reverse-poison pills")
        while num_consumers > 0:
            tmp = resultsConsumers.get()
            # Check for reverse-poison pill
            if tmp is None:
                num_consumers -= 1
                logger.debug("Consumer finished!")
        logger.debug("All consumers accounted for")

        # Wait for consumer threads to finish
        logger.debug("Waiting for consumer threads to finish")
        # NOTE(review): the loop body (presumably a join) is missing.
        for consumer in consumers:
        logger.debug("Consumer threads finished")

        # Print hit histogram:
        # Results are (colour, (name, regex, hits)) tuples, starting with a
        # header row; the histogram queue is drained into the list.
        results = []
        results.append(('cyan', ("Hit histogram:", "", "")))
        while not hitHistogram_queue.empty():
            (name, regex, regex_hits) = hitHistogram_queue.get()
            results.append(('white', (name, regex, regex_hits)))
        # NOTE(review): the body of this `if` is missing.
        if len(results) > 1:

    # Stop timer
    t1 = time.time()

    logger.info("Search hits: %d" % num_hits.value)
    logger.info("Suppresed duplicate hits: %d" % num_hits_suppressed.value)
    logger.info("Search time: %s" % (str(timedelta(seconds=(t1 - t0)))))

    if num_hits.value:
        # Dump head of output file:
        # NOTE(review): the result of file_size() is used as a line count in
        # the "lines suppressed" message - confirm what that helper returns.
        num_lines = file_size(options.outputFile)
        from itertools import islice
        # Read at most the first five lines of the output file.
        with open(options.outputFile) as myfile:
            head = list(islice(myfile, 5))
        # NOTE(review): the loop body (presumably logging each line) is missing.
        for line in head:
        logger.info("(%d lines suppressed)" % max(0, (num_lines - 5)))

    return (num_hits.value, num_hits_suppressed.value, results)
예제 #4
def GetIDForHosts(fileFullPathList, DB):
    # Group the given files by host name, look each host up in (or add it to)
    # the Hosts table, and collect the file instances that still have to be
    # loaded.
    #
    # fileFullPathList: iterable of full file paths to examine.
    # DB:               database object; appGetConn() supplies the connection.
    #
    # Returns a list of tuples
    #   (file_fullpath, instance_ID, hostName, hostID, ingest_plugin)
    # where ingest_plugin is the plugin object taken from ingest_plugins.
    #
    # NOTE(review): this copy of the function is missing lines (`try:` and
    # `else:` keywords, `break` statements, and probably the updates of
    # tmpInstances); the comments flag each gap.
    hostsTest = {}
    hostsProcess = []
    progress_total = 0
    progress_current = 0

    # Determine plugin_type and hostname
    for file_name_fullpath in fileFullPathList:
        hostName = None
        ingest_type = None
        loop_counter = 0
        # Try the plugin at the head of ingest_plugins_types_stack until one
        # accepts the file or every plugin has been tried.
        # NOTE(review): no `break` is visible anywhere in this loop.
        while True:
            if loop_counter > len(ingest_plugins_types_stack):
                # We ignore empty file from hosts with no appcompat data
                # todo: Omit suppression on verbose mode
                tmp_file_size = file_size(file_name_fullpath)
                if tmp_file_size > 500:
                    logger.warning("No ingest plugin could process: %s (skipping file) [size: %d]" %
                                   (ntpath.basename(file_name_fullpath), tmp_file_size))
            ingest_type = ingest_plugins_types_stack[0]
            if ingest_plugins[ingest_type].matchFileNameFilter(file_name_fullpath):
                # Check magic:
                # NOTE(review): the `try:` that matches the `except` below is
                # missing. When checkMagic returns a tuple, only an error is
                # logged and magic_check_res is not assigned in that branch.
                    magic_check = ingest_plugins[ingest_type].checkMagic(file_name_fullpath)
                    if isinstance(magic_check, tuple):
                        logger.error("Report bug")
                    else: magic_check_res = magic_check
                    if magic_check_res:
                        # Magic OK, go with this plugin
                        hostName = ingest_plugins[ingest_type].getHostName(file_name_fullpath)
                except Exception as e:
                    logger.exception("Error processing: %s (%s)" % (file_name_fullpath, str(e)))
            # Emulate stack with list to minimize internal looping (place last used plugin at the top)
            # NOTE(review): the visible line only re-inserts ingest_type at
            # the end of the list; the matching removal is not visible.
            ingest_plugins_types_stack.insert(len(ingest_plugins_types_stack), ingest_type)
            loop_counter += 1
        # Group (file, plugin) pairs under their host name.
        # NOTE(review): an `else:` before the list initialisation is missing.
        if hostName is not None:
            if hostName in hostsTest:
                hostsTest[hostName].append((file_name_fullpath, ingest_plugins[ingest_type]))
                hostsTest[hostName] = []
                hostsTest[hostName].append((file_name_fullpath, ingest_plugins[ingest_type]))

    progress_total = len(hostsTest.keys())
    # Iterate over hosts. If host exists in DB grab rowID else create and grab rowID.
    conn = DB.appGetConn()
    with closing(conn.cursor()) as c:
        for hostName in hostsTest.keys():
            logger.debug("Processing host: %s" % hostName)
            # Check if Host exists
            # NOTE(review): hostName is interpolated straight into the SQL
            # text in the statements below, so a quote character in the name
            # would break them; bound parameters would avoid that.
            c.execute("SELECT count(*) FROM Hosts WHERE HostName = '%s'" % hostName)
            data = c.fetchone()[0]
            if (data != 0):
                # Host already has at least one instance in the DB
                c.execute("SELECT HostID, Instances FROM Hosts WHERE HostName = '%s'" % hostName)
                data = c.fetchone()
                tmpHostID = data[0]
                # NOTE(review): eval() is applied to a value read from the
                # database - confirm that this column can be trusted.
                tmpInstances = eval(data[1])
                # NOTE(review): the `try:` and `else:` keywords around the
                # instance-ID calculation are missing, as are the `else:`
                # before the duplicate message and probably the append of
                # new IDs to tmpInstances.
                for (file_fullpath, ingest_plugin) in hostsTest[hostName]:
                    logger.debug("Grabbing instanceID from file: %s" % file_fullpath)
                        instance_ID = CalculateInstanceID(file_fullpath, ingest_plugin)
                    except Exception:
                        logger.error("Error parsing: %s (skipping)" % file_fullpath)
                        if str(instance_ID) not in tmpInstances:
                            hostsProcess.append((file_fullpath, instance_ID, hostName, tmpHostID, ingest_plugin))
                            logger.debug("Duplicate host and instance found: %s" %hostName)
                # Save updated Instances list
                c.execute("UPDATE Hosts SET Instances = %s, InstancesCounter = %d WHERE HostName = '%s'" % ('"' + str(repr(tmpInstances)) + '"', len(tmpInstances), hostName))
                # NOTE(review): an `else:` for the host-exists check appears
                # to be missing before the section below.
                # Host does not exist. Add instance and grab the host ID.
                tmpInstances = []
                newInstances = []
                # NOTE(review): the `try:` and `else:` keywords are missing
                # here as well.
                for (file_fullpath, ingest_plugin) in hostsTest[hostName]:
                        instance_ID = CalculateInstanceID(file_fullpath, ingest_plugin)
                    except Exception:
                        logger.error("Error parsing: %s (skipping)" % file_fullpath)
                        if str(instance_ID) not in tmpInstances:
                            newInstances.append((file_fullpath, instance_ID, ingest_plugin))

                # Insert the host row; the new row's ID is read back from
                # the cursor's lastrowid.
                c.execute("INSERT INTO Hosts VALUES (NULL,%s,%s,%d,%d,%d)" % ('"' + hostName + '"', '"' + str(repr(tmpInstances)) + '"', len(tmpInstances), 0, 0))
                tmpHostID = c.lastrowid
                for (file_fullpath, instance_ID, ingest_plugin) in newInstances:
                    # todo: Do we want/need each row to track from what instance it came?
                    hostsProcess.append((file_fullpath, instance_ID, hostName, tmpHostID, ingest_plugin))
            # Update progress
            progress_current += 1
            if settings.logger_getDebugMode():
                status_extra_data = " [RAM: %d%%]" % psutil_phymem_usage()
            else: status_extra_data = ""
            # logger.debug("Pre-process new hosts/instances%s" % status_extra_data)
            logger.info(update_progress(min(1, float(progress_current) / float(progress_total)), "Calculate ID's for new hosts/instances%s" % status_extra_data, True))

    # Return hosts to be processed
    return hostsProcess