def test_minio():
    """Round-trip a small payload through a local Minio-backed FileStore.

    Exercises delete/put/exists/get against the `test` bucket on localhost:9000,
    cleaning the key up before and after.
    """
    key = 'al4_minio_pytest.txt'
    payload = b"THIS IS A MINIO TEST"

    store = FileStore('s3://al_storage_key:Ch@ngeTh!sPa33w0rd@localhost:9000/?s3_bucket=test&use_ssl=False')

    # Make sure no leftover from a previous run shadows this test.
    assert store.delete(key) is None

    assert store.put(key, payload) != []
    assert store.exists(key) != []
    assert store.get(key) == payload

    # Leave the bucket clean.
    assert store.delete(key) is None
class CacheStore(object):
    """Component-scoped cache backed by a FileStore.

    Content is stored in the cache filestore under keys prefixed with the
    component name, and an expiry record is kept in the `cached_file`
    datastore collection so entries can be aged out.
    """

    def __init__(self, component, config=None, datastore=None):
        """Create a cachestore for `component`.

        :param component: non-empty component name (letters, numbers, underscores, dots)
        :param config: optional pre-loaded config; loaded via forge when omitted
        :param datastore: optional datastore handle; created via forge when omitted
        :raises ValueError: when component is empty or contains invalid characters
        """
        if not component:
            # Fixed typo in message: "instanciate" -> "instantiate".
            raise ValueError("Cannot instantiate a cachestore without providing a component name.")

        if not COMPONENT_VALIDATOR.match(component):
            raise ValueError("Invalid component name. (Only letters, numbers, underscores and dots allowed)")

        if config is None:
            config = forge.get_config()

        self.component = component
        # Pass the loaded config through so the datastore does not re-load it
        # (consistent with the other CacheStore implementation in this file).
        self.datastore = datastore or forge.get_datastore(config=config)
        self.filestore = FileStore(*config.filestore.cache)

    def __enter__(self):
        return self

    def __exit__(self, ex_type, exc_val, exc_tb):
        # Release filestore connections when used as a context manager.
        self.filestore.close()

    def save(self, cache_key, data, ttl=DEFAULT_CACHE_LEN):
        """Store `data` under `cache_key` and record its expiry timestamp.

        :raises ValueError: when cache_key contains invalid characters
        """
        if not COMPONENT_VALIDATOR.match(cache_key):
            raise ValueError("Invalid cache_key for cache item. "
                             "(Only letters, numbers, underscores and dots allowed)")

        new_key = f"{self.component}_{cache_key}" if self.component else cache_key

        # Record the expiry first so every stored blob has an expiry entry.
        self.datastore.cached_file.save(new_key, {'expiry_ts': now_as_iso(ttl), 'component': self.component})
        self.filestore.put(new_key, data)

    def get(self, cache_key):
        """Return the content stored under `cache_key` for this component."""
        new_key = f"{self.component}_{cache_key}" if self.component else cache_key
        return self.filestore.get(new_key)

    def delete(self, cache_key, db_delete=True):
        """Remove the cached content; also drop the datastore record unless db_delete is False."""
        new_key = f"{self.component}_{cache_key}" if self.component else cache_key
        self.filestore.delete(new_key)
        if db_delete:
            self.datastore.cached_file.delete(new_key)
class CacheStore(object):
    """Component-scoped cache backed by a FileStore.

    Cached blobs live in the cache filestore under component-prefixed keys,
    while expiry metadata for each entry is tracked in the `cached_file`
    datastore collection.
    """

    def __init__(self, component: str, config=None, datastore=None):
        """Create a cachestore bound to `component`.

        :raises ValueError: when component is empty or contains invalid characters
        """
        if not component:
            raise ValueError("Cannot instantiate a cachestore without providing a component name.")

        if not COMPONENT_VALIDATOR.match(component):
            raise ValueError("Invalid component name. (Only letters, numbers, underscores and dots allowed)")

        if config is None:
            config = forge.get_config()

        self.component = component
        self.datastore = datastore or forge.get_datastore(config=config)
        self.filestore = FileStore(*config.filestore.cache)

    def __enter__(self) -> 'CacheStore':
        return self

    def __exit__(self, ex_type, exc_val, exc_tb):
        self.filestore.close()

    def _namespaced(self, cache_key: str) -> str:
        # Prefix keys with the component name so components cannot collide.
        return f"{self.component}_{cache_key}" if self.component else cache_key

    def _check_key(self, cache_key: str):
        # Keys share the component-name character restrictions.
        if not COMPONENT_VALIDATOR.match(cache_key):
            raise ValueError("Invalid cache_key for cache item. "
                             "(Only letters, numbers, underscores and dots allowed)")

    def save(self, cache_key: str, data: AnyStr, ttl=DEFAULT_CACHE_LEN, force=False):
        """Store `data` under `cache_key` and record its expiry timestamp."""
        self._check_key(cache_key)
        storage_key = self._namespaced(cache_key)

        self.datastore.cached_file.save(storage_key, {'expiry_ts': now_as_iso(ttl), 'component': self.component})
        self.filestore.put(storage_key, data, force=force)

    def upload(self, cache_key: str, path: str, ttl=DEFAULT_CACHE_LEN):
        """Store the file found at `path` under `cache_key` and record its expiry."""
        self._check_key(cache_key)
        storage_key = self._namespaced(cache_key)

        self.datastore.cached_file.save(storage_key, {'expiry_ts': now_as_iso(ttl), 'component': self.component})
        self.filestore.upload(storage_key, path, force=True)

    def touch(self, cache_key: str, ttl=DEFAULT_CACHE_LEN):
        """Refresh the expiry timestamp of an existing entry.

        :raises KeyError: when the entry does not exist in the filestore
        """
        self._check_key(cache_key)
        if not self.exists(cache_key):
            raise KeyError(cache_key)

        storage_key = self._namespaced(cache_key)
        self.datastore.cached_file.save(storage_key, {'expiry_ts': now_as_iso(ttl), 'component': self.component})

    def get(self, cache_key: str) -> Optional[bytes]:
        """Return the content stored under `cache_key` for this component."""
        return self.filestore.get(self._namespaced(cache_key))

    def download(self, cache_key: str, path: str):
        """Copy the cached content to the local file at `path`."""
        return self.filestore.download(self._namespaced(cache_key), path)

    def exists(self, cache_key: str):
        """Check whether the entry exists in the filestore."""
        return self.filestore.exists(self._namespaced(cache_key))

    def delete(self, cache_key: str, db_delete=True):
        """Remove the cached content; also drop the datastore record unless db_delete is False."""
        storage_key = self._namespaced(cache_key)
        self.filestore.delete(storage_key)
        if db_delete:
            self.datastore.cached_file.delete(storage_key)
class ExpiryManager(ServerBase):
    """Background server that deletes expired documents and archives old ones.

    On each cycle it walks every collection whose model has an `expiry_ts`
    field and deletes documents past their expiry (optionally deleting the
    associated filestore/cachestore content), then — when ILM is enabled —
    walks every collection with an `archive_ts` field and moves old documents
    to the archive.
    """

    def __init__(self, force_ilm=False):
        """Set up datastores, filestores, metrics and the APM client.

        :param force_ilm: when True, force-enable ILM regardless of config
        """
        self.config = forge.get_config()
        if force_ilm:
            self.config.datastore.ilm.enabled = True

        super().__init__('assemblyline.expiry', shutdown_timeout=self.config.core.expiry.sleep_time + 5)
        # Archive-aware handle for expiry work, hot-only handle to check for
        # newer hot copies before deleting archived content.
        self.datastore = forge.get_datastore(config=self.config, archive_access=True)
        self.hot_datastore = forge.get_datastore(config=self.config, archive_access=False)
        self.filestore = forge.get_filestore(config=self.config)
        self.cachestore = FileStore(*self.config.filestore.cache)
        self.expirable_collections = []
        self.archiveable_collections = []
        self.counter = MetricsFactory('expiry', Metrics)
        self.counter_archive = MetricsFactory('archive', Metrics)

        # Map collection name -> handler used to delete the stored content
        # that backs a document of that collection.
        if self.config.datastore.ilm.enabled:
            self.fs_hashmap = {
                'file': self.archive_filestore_delete,
                'cached_file': self.archive_cachestore_delete
            }
        else:
            self.fs_hashmap = {
                'file': self.filestore_delete,
                'cached_file': self.cachestore_delete
            }

        # Collections qualify for archiving/expiry based on their model fields.
        for name, definition in self.datastore.ds.get_models().items():
            if hasattr(definition, 'archive_ts'):
                self.archiveable_collections.append(getattr(self.datastore, name))
            if hasattr(definition, 'expiry_ts'):
                self.expirable_collections.append(getattr(self.datastore, name))

        if self.config.core.metrics.apm_server.server_url is not None:
            self.log.info(f"Exporting application metrics to: {self.config.core.metrics.apm_server.server_url}")
            elasticapm.instrument()
            self.apm_client = elasticapm.Client(
                server_url=self.config.core.metrics.apm_server.server_url,
                service_name="expiry")
        else:
            self.apm_client = None

    def stop(self):
        """Stop metrics export and APM instrumentation, then the server itself."""
        if self.counter:
            self.counter.stop()

        if self.apm_client:
            elasticapm.uninstrument()
        super().stop()

    def filestore_delete(self, sha256, _):
        self.filestore.delete(sha256)

    def archive_filestore_delete(self, sha256, expiry_time):
        # If we are working with an archive, there may be a hot entry that expires later.
        doc = self.hot_datastore.file.get_if_exists(sha256, as_obj=False)
        if doc and doc['expiry_ts'] > expiry_time:
            return

        self.filestore.delete(sha256)

    def cachestore_delete(self, sha256, _):
        # Fixed: cached files live in the cachestore, not the main filestore.
        # (The previous code deleted from self.filestore, leaving cached
        # content behind; archive_cachestore_delete already used cachestore.)
        self.cachestore.delete(sha256)

    def archive_cachestore_delete(self, sha256, expiry_time):
        # Keep the content if a hot copy exists with a later expiry.
        doc = self.hot_datastore.cached_file.get_if_exists(sha256, as_obj=False)
        if doc and doc['expiry_ts'] > expiry_time:
            return

        self.cachestore.delete(sha256)

    def run_expiry_once(self):
        """Delete expired documents (and their stored files) from every expirable collection.

        :returns: True when any collection hit the EXPIRY_SIZE cap, meaning
                  more work is immediately available.
        """
        now = now_as_iso()
        reached_max = False

        # Expire data
        for collection in self.expirable_collections:
            # Call heartbeat pre-dated by 5 minutes. If a collection takes more than
            # 5 minutes to expire, this container could be seen as unhealthy. The down
            # side is if it is stuck on something it will be more than 5 minutes before
            # the container is restarted.
            self.heartbeat(int(time.time() + 5 * 60))

            # Start of expiry transaction
            if self.apm_client:
                self.apm_client.begin_transaction("Delete expired documents")

            # With batch delete, round the cutoff down to the day so whole days
            # are deleted at once; otherwise use the exact delayed timestamp.
            if self.config.core.expiry.batch_delete:
                computed_date = epoch_to_iso(
                    dm(f"{now}||-{self.config.core.expiry.delay}h/d").float_timestamp)
            else:
                computed_date = epoch_to_iso(
                    dm(f"{now}||-{self.config.core.expiry.delay}h").float_timestamp)

            delete_query = f"expiry_ts:[* TO {computed_date}]"

            # Only sort (and delete files) when this collection has backing storage.
            if self.config.core.expiry.delete_storage and collection.name in self.fs_hashmap:
                file_delete = True
                sort = ["expiry_ts asc", "id asc"]
            else:
                file_delete = False
                sort = None

            number_to_delete = collection.search(
                delete_query, rows=0, as_obj=False, use_archive=True,
                sort=sort, track_total_hits=EXPIRY_SIZE)['total']

            if self.apm_client:
                elasticapm.label(query=delete_query)
                elasticapm.label(number_to_delete=number_to_delete)

            self.log.info(f"Processing collection: {collection.name}")
            if number_to_delete != 0:
                if file_delete:
                    with elasticapm.capture_span(
                            name='FILESTORE [ThreadPoolExecutor] :: delete()',
                            labels={"num_files": number_to_delete, "query": delete_query}):
                        # Delete associated files
                        with concurrent.futures.ThreadPoolExecutor(
                                self.config.core.expiry.workers,
                                thread_name_prefix="file_delete") as executor:
                            for item in collection.search(
                                    delete_query, fl='id', rows=number_to_delete,
                                    sort=sort, use_archive=True, as_obj=False)['items']:
                                executor.submit(self.fs_hashmap[collection.name],
                                                item['id'], computed_date)

                        self.log.info(f' Deleted associated files from the '
                                      f'{"cachestore" if "cache" in collection.name else "filestore"}...')

                    # Proceed with deletion
                    collection.delete_by_query(
                        delete_query, workers=self.config.core.expiry.workers,
                        sort=sort, max_docs=number_to_delete)
                else:
                    # Proceed with deletion
                    collection.delete_by_query(
                        delete_query, workers=self.config.core.expiry.workers)

                if number_to_delete == EXPIRY_SIZE:
                    reached_max = True

                self.counter.increment(f'{collection.name}', increment_by=number_to_delete)
                self.log.info(f" Deleted {number_to_delete} items from the datastore...")
            else:
                self.log.debug(" Nothing to delete in this collection.")

            # End of expiry transaction
            if self.apm_client:
                self.apm_client.end_transaction(collection.name, 'deleted')

        return reached_max

    def run_archive_once(self):
        """Move old documents to the archive for every archiveable collection.

        :returns: True when any collection hit the ARCHIVE_SIZE cap (more work
                  pending); always False when ILM is disabled.
        """
        reached_max = False
        if not self.config.datastore.ilm.enabled:
            return reached_max

        now = now_as_iso()

        # Archive data
        for collection in self.archiveable_collections:
            # Call heartbeat pre-dated by 5 minutes. If a collection takes more than
            # 5 minutes to expire, this container could be seen as unhealthy. The down
            # side is if it is stuck on something it will be more than 5 minutes before
            # the container is restarted.
            self.heartbeat(int(time.time() + 5 * 60))

            # Start of expiry transaction
            if self.apm_client:
                self.apm_client.begin_transaction("Archive older documents")

            archive_query = f"archive_ts:[* TO {now}]"
            sort = ["archive_ts asc", "id asc"]

            number_to_archive = collection.search(
                archive_query, rows=0, as_obj=False, use_archive=False,
                sort=sort, track_total_hits=ARCHIVE_SIZE)['total']

            if number_to_archive == ARCHIVE_SIZE:
                reached_max = True

            if self.apm_client:
                elasticapm.label(query=archive_query)
                elasticapm.label(number_to_archive=number_to_archive)

            self.log.info(f"Processing collection: {collection.name}")
            if number_to_archive != 0:
                # Proceed with archiving
                if collection.archive(archive_query, max_docs=number_to_archive, sort=sort):
                    self.counter_archive.increment(f'{collection.name}', increment_by=number_to_archive)
                    self.log.info(f" Archived {number_to_archive} documents...")
                else:
                    self.log.warning(f" Failed to properly archive {number_to_archive} documents...")
            else:
                self.log.debug(" Nothing to archive in this collection.")

            # End of expiry transaction
            if self.apm_client:
                self.apm_client.end_transaction(collection.name, 'archived')

        return reached_max

    def try_run(self):
        """Main loop: alternate expiry and archive passes until stopped.

        Only sleeps when neither pass hit its batch-size cap, so backlogs are
        drained at full speed.
        """
        while self.running:
            expiry_maxed_out = False
            archive_maxed_out = False

            # Each pass is isolated so one failing collection does not stop the loop.
            try:
                expiry_maxed_out = self.run_expiry_once()
            except Exception as e:
                self.log.exception(str(e))

            try:
                archive_maxed_out = self.run_archive_once()
            except Exception as e:
                self.log.exception(str(e))

            if not expiry_maxed_out and not archive_maxed_out:
                self.sleep_with_heartbeat(self.config.core.expiry.sleep_time)