def appLoadMP(pathToLoad, dbfilenameFullPath, maxCores, governorOffFlag):
    global _tasksPerJob
    files_to_process = []
    conn = None

    # Start timer
    t0 = datetime.now()
    logger.debug("Starting appLoadMP")

    # Calculate aggregate file_filter for all supported ingest types:
    file_filter = '|'.join([v.getFileNameFilter() for v in ingest_plugins.itervalues()])
    # Add zip extension
    file_filter += r"|.*\.zip"

    # Check if we're loading Redline data
    if os.path.isdir(pathToLoad) and os.path.basename(pathToLoad).lower() == 'RedlineAudits'.lower():
        files_to_process = searchRedLineAudits(pathToLoad)
    else:
        # Search for all files to be processed
        if os.path.isdir(pathToLoad):
            files_to_process = searchFolders(pathToLoad, file_filter)
        else:
            files_to_process = processArchives(pathToLoad, file_filter)

    if files_to_process:
        # Init DB if required
        DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
        conn = DB.appConnectDB()

        # Extract hostnames, grab existing host IDs from the DB and calculate instance IDs for new instances to be ingested:
        instancesToProcess = []
        instancesToProcess += GetIDForHosts(files_to_process, DB)
        countInstancesToProcess = len(instancesToProcess)
        logger.info("Found %d new instances" % countInstancesToProcess)

        # Set up initial producer/consumer counts
        num_consumers = 1
        num_producers = 1

        # Setup MPEngine
        mpe = MPEngineProdCons(maxCores, appLoadProd, appLoadCons, governorOffFlag)

        # Reduce _tasksPerJob for small jobs
        if countInstancesToProcess < _tasksPerJob:
            _tasksPerJob = 1

        # Create task list
        task_list = []
        instancesPerJob = _tasksPerJob
        num_tasks = 0
        for chunk in chunks(instancesToProcess, instancesPerJob):
            # todo: We no longer need pathToLoad as tasks include the full path now
            task_list.append(Task(pathToLoad, chunk))
            num_tasks += 1

        if num_tasks > 0:
            # Check if we have to drop indexes to speed up insertions
            # todo: Research the ratio of existing hosts to new hosts where this makes sense
            if countInstancesToProcess > 1000 or DB.CountHosts() < 1000:
                DB.appDropIndexesDB()

            # Queue tasks for Producers
            mpe.addTaskList(task_list)

            # Start procs
            mpe.startProducers(num_producers)
            mpe.startConsumers(num_consumers, [dbfilenameFullPath])
            # mpe.addProducer()

            # Control loop
            while mpe.working():
                time.sleep(1.0)
                (num_producers, num_consumers, num_tasks, progress_producers, progress_consumers) = mpe.getProgress()
                elapsed_time = datetime.now() - t0
                mean_loadtime_per_host = elapsed_time / max(1, _tasksPerJob * progress_consumers)
                pending_hosts = (num_tasks * _tasksPerJob) - (_tasksPerJob * progress_consumers)
                etr = mean_loadtime_per_host * pending_hosts
                eta = t0 + elapsed_time + etr
                ett = eta - t0
                if settings.logger_getDebugMode():
                    status_extra_data = " Prod: %s Cons: %s (%d -> %d -> %d: %d) [RAM: %d%% / Obj: %d / ETH: %s / ETA: %s / ETT: %s]" % \
                        (num_producers, num_consumers, num_tasks, progress_producers, progress_consumers,
                         progress_producers - progress_consumers, psutil_phymem_usage(), len(gc.get_objects()),
                         mean_loadtime_per_host if progress_consumers * _tasksPerJob > 100 else "N/A",
                         str(eta.time()).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A",
                         str(ett).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A")
                else:
                    status_extra_data = ""
                # logger.info("Parsing files%s" % status_extra_data)
                logger.info(update_progress(min(1, float(progress_consumers) / float(num_tasks)),
                                            "Parsing files%s" % status_extra_data, True))
                mpe.rebalance()
            del mpe

        # Stop timer
        elapsed_time = datetime.now() - t0
        mean_loadtime_per_host = elapsed_time / max(1, countInstancesToProcess)
        logger.info("Load speed: %s seconds / file" % mean_loadtime_per_host)
        logger.info("Load time: %s" % str(elapsed_time).split(".")[0])
    else:
        logger.info("Found no files to process!")
def GetIDForHosts(fileFullPathList, DB):
    # Returns: (filePath, instanceID, hostname, hostID, ingest_type)
    hostsTest = {}
    hostsProcess = []
    progress_total = 0
    progress_current = 0

    # Determine plugin_type and hostname
    for file_name_fullpath in fileFullPathList:
        hostName = None
        ingest_type = None
        loop_counter = 0
        while True:
            if loop_counter > len(ingest_plugins_types_stack):
                # We ignore empty files from hosts with no appcompat data
                # todo: Omit suppression on verbose mode
                tmp_file_size = file_size(file_name_fullpath)
                if tmp_file_size > 500:
                    logger.warning("No ingest plugin could process: %s (skipping file) [size: %d]"
                                   % (ntpath.basename(file_name_fullpath), tmp_file_size))
                break
            ingest_type = ingest_plugins_types_stack[0]
            if ingest_plugins[ingest_type].matchFileNameFilter(file_name_fullpath):
                # Check magic:
                try:
                    magic_check = ingest_plugins[ingest_type].checkMagic(file_name_fullpath)
                    if isinstance(magic_check, tuple):
                        # checkMagic is expected to return a bool; treat a tuple as a plugin bug and fail the check
                        logger.error("Report bug")
                        magic_check_res = False
                    else:
                        magic_check_res = magic_check
                    if magic_check_res:
                        # Magic OK, go with this plugin
                        hostName = ingest_plugins[ingest_type].getHostName(file_name_fullpath)
                        break
                except Exception as e:
                    logger.exception("Error processing: %s (%s)" % (file_name_fullpath, str(e)))
            # Emulate a stack with a list to minimize internal looping
            # (rotate the failed plugin to the bottom so the last successful plugin stays on top)
            ingest_plugins_types_stack.remove(ingest_type)
            ingest_plugins_types_stack.append(ingest_type)
            loop_counter += 1
        if hostName is not None:
            if hostName in hostsTest:
                hostsTest[hostName].append((file_name_fullpath, ingest_plugins[ingest_type]))
            else:
                hostsTest[hostName] = [(file_name_fullpath, ingest_plugins[ingest_type])]

    progress_total = len(hostsTest.keys())

    # Iterate over hosts. If the host exists in the DB grab its rowID, else create it and grab the rowID.
    conn = DB.appGetConn()
    with closing(conn.cursor()) as c:
        for hostName in hostsTest.keys():
            assert hostName
            logger.debug("Processing host: %s" % hostName)
            # Check if Host exists
            c.execute("SELECT count(*) FROM Hosts WHERE HostName = '%s'" % hostName)
            data = c.fetchone()[0]
            if data != 0:
                # Host already has at least one instance in the DB
                c.execute("SELECT HostID, Instances FROM Hosts WHERE HostName = '%s'" % hostName)
                data = c.fetchone()
                tmpHostID = data[0]
                tmpInstances = eval(data[1])
                for (file_fullpath, ingest_plugin) in hostsTest[hostName]:
                    logger.debug("Grabbing instanceID from file: %s" % file_fullpath)
                    try:
                        instance_ID = CalculateInstanceID(file_fullpath, ingest_plugin)
                    except Exception:
                        logger.error("Error parsing: %s (skipping)" % file_fullpath)
                        traceback.print_exc(file=sys.stdout)
                    else:
                        if str(instance_ID) not in tmpInstances:
                            tmpInstances.append(str(instance_ID))
                            hostsProcess.append((file_fullpath, instance_ID, hostName, tmpHostID, ingest_plugin))
                        else:
                            logger.debug("Duplicate host and instance found: %s" % hostName)
                            continue
                # Save updated Instances list
                c.execute("UPDATE Hosts SET Instances = %s, InstancesCounter = %d WHERE HostName = '%s'"
                          % ('"' + str(repr(tmpInstances)) + '"', len(tmpInstances), hostName))
            else:
                # Host does not exist. Add instance and grab the host ID.
                tmpInstances = []
                newInstances = []
                for (file_fullpath, ingest_plugin) in hostsTest[hostName]:
                    try:
                        instance_ID = CalculateInstanceID(file_fullpath, ingest_plugin)
                    except Exception:
                        logger.error("Error parsing: %s (skipping)" % file_fullpath)
                        traceback.print_exc(file=sys.stdout)
                    else:
                        if str(instance_ID) not in tmpInstances:
                            tmpInstances.append(str(instance_ID))
                            newInstances.append((file_fullpath, instance_ID, ingest_plugin))
                c.execute("INSERT INTO Hosts VALUES (NULL,%s,%s,%d,%d,%d)"
                          % ('"' + hostName + '"', '"' + str(repr(tmpInstances)) + '"', len(tmpInstances), 0, 0))
                tmpHostID = c.lastrowid
                for (file_fullpath, instance_ID, ingest_plugin) in newInstances:
                    # todo: Do we want/need each row to track which instance it came from?
                    hostsProcess.append((file_fullpath, instance_ID, hostName, tmpHostID, ingest_plugin))

            # Update progress
            progress_current += 1
            if settings.logger_getDebugMode():
                status_extra_data = " [RAM: %d%%]" % psutil_phymem_usage()
            else:
                status_extra_data = ""
            # logger.debug("Pre-process new hosts/instances%s" % status_extra_data)
            logger.info(update_progress(min(1, float(progress_current) / float(progress_total)),
                                        "Calculate IDs for new hosts/instances%s" % status_extra_data, True))
        conn.commit()

    # Return hosts to be processed
    return hostsProcess
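
# --- Hedged sketch (not part of the original module) -------------------------
# GetIDForHosts() above calls CalculateInstanceID(file_fullpath, ingest_plugin)
# to derive a stable per-file ID so already-ingested instances can be skipped.
# That function is defined elsewhere in the project; the stand-in below only
# illustrates the idea with a content hash and may differ from the real code.
import hashlib

def CalculateInstanceID_sketch(file_fullpath, ingest_plugin):
    """Hypothetical stand-in: MD5 of the raw file bytes as the instance ID."""
    # ingest_plugin is unused in this sketch; the real function presumably uses
    # it for format-specific parsing rather than hashing the whole file.
    md5 = hashlib.md5()
    with open(file_fullpath, 'rb') as f:
        for block in iter(lambda: f.read(64 * 1024), b''):
            md5.update(block)
    return md5.hexdigest()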