def __init__(self, batch_size=500, age=0):
    """Initialize the class.

    Args:
        batch_size: Number of files to read
        age: Minimum age of files to be read per batch

    Returns:
        None

    """
    # Millisecond timestamp identifying this batch of reads
    self._batch_id = int(time.time() * 1000)

    # Locate the agent's cache directory from configuration
    cache_directory = Config().agent_cache_directory(PATTOO_API_AGENT_NAME)

    # Pull up to batch_size JSON files at least `age` seconds old.
    # die=False: an empty directory is not fatal — we just get no data.
    self._data = files.read_json_files(
        cache_directory, die=False, age=age, count=batch_size)

    # Number of files actually read in this batch
    self.files = len(self._data)
def process_cache(batch_size=500, max_duration=3600, fileage=10, script=False):
    """Ingest data.

    Args:
        batch_size: Number of files to process at a time
        max_duration: Maximum duration
        fileage: Minimum age of files to be processed in seconds
        script: True if running as a script (uses a lockfile); the daemon
            has its own locking mechanism

    Returns:
        success: True if successful

    Method:
        1) Read the files in the cache directory older than a threshold
        2) Process the data in the files
        3) Repeat, if new files are found that are older than the threshold,
           or we have been running too long.

        Batches of files are read to reduce the risk of overloading available
        memory, and ensure we can exit if we are running too long.

    """
    # Initialize key variables
    records = 0
    start = time.time()
    looptime = 0
    files_read = 0
    success = True

    # Get cache directory
    config = Config()
    directory = config.agent_cache_directory(PATTOO_API_AGENT_NAME)

    # Log what we are doing
    log_message = 'Processing ingest cache.'
    log.log2info(20085, log_message)

    # Snapshot the number of files in the directory. Agents keep writing
    # new files, so we stop once we've read this many (see 2 of 3 below)
    # rather than chasing a moving target forever.
    files_found = len(
        [_ for _ in os.listdir(directory) if _.endswith('.json')])

    # Create lockfile only if running as a script.
    # The daemon has its own locking mechanism
    if script:
        success = _lock()
        if not success:
            return bool(success)

    # Process the files in batches to reduce the database connection count
    # This can cause errors
    while True:
        # Agents constantly update files. We don't want an infinite loop
        # situation where we always have files available that are newer than
        # the desired fileage.
        loopstart = time.time()
        fileage = fileage + looptime

        # Automatically stop if we are going on too long. (1 of 3)
        duration = loopstart - start
        if duration > max_duration:
            log_message = ('''\
Stopping ingester after exceeding the maximum runtime duration of {}s. \
This can be adjusted on the CLI.'''.format(max_duration))
            log.log2info(20022, log_message)
            break

        # Automatically stop once we've read as many files as existed when
        # we started. (2 of 3)
        if files_read >= files_found:
            # No need to log. This is an expected outcome.
            break

        # Read data from cache. Stop if there is no data found.
        cache = Cache(batch_size=batch_size, age=fileage)
        count = cache.ingest()

        # Automatically stop if this pass read no files. (3 of 3)
        if not cache.files:
            # No need to log. This is an expected outcome.
            break

        # Get the records processed, looptime and files read
        records += count
        files_read += cache.files
        looptime = max(time.time() - loopstart, looptime)

    # Print result
    duration = time.time() - start
    if records and duration:
        log_message = ('''\
Agent cache ingest completed. {0} records processed in {1:.2f} seconds, \
{2:.2f} records / second. {3} files read. \
'''.format(records, duration, records / duration, files_read))
        log.log2info(20084, log_message)
    else:
        log_message = 'No files found to ingest'
        log.log2info(20021, log_message)

    # Delete lockfile only if running as a script.
    # The daemon has its own locking mechanism
    if script:
        success = _lock(delete=True)

    # Log what we are doing
    log_message = 'Finished processing ingest cache.'
    log.log2info(20020, log_message)

    return bool(success)