def finalize(self, **kwargs):
    self.run_pool_attendant = False
    # use the shared pool lock - a freshly created local lock would
    # serialize nothing against the attendant or append()
    locked = self._pool_lock.acquire(timeout=10)
    try:
        # snapshot the pool, we're removing writers as we iterate
        for blob_writer_identity in list(self.writer_pool.writers):
            try:
                get_logger().debug(
                    f"Removing from the writer pool during finalization, identity={blob_writer_identity['identity']}, poolsize={len(self.writer_pool.writers)}"
                )
                self.writer_pool.remove_writer(blob_writer_identity["identity"])
            except Exception as err:
                get_logger().debug(
                    f"Error finalizing `{blob_writer_identity}`, {type(err).__name__} - {err}"
                )
    finally:
        if locked:
            self._pool_lock.release()
    return super().finalize()
def pool_attendant(self):
    """
    Writer Pool Management
    """
    while self.run_pool_attendant:
        # take the shared pool lock so evictions don't race with append()
        locked = self._pool_lock.acquire(timeout=10)
        try:
            # search for pool occupants who haven't had a write recently
            for blob_writer_identity in self.writer_pool.get_stale_writers(
                    self.idle_timeout_seconds):
                get_logger().debug(
                    f"Evicting {blob_writer_identity} from the writer pool due to inactivity - limit is {self.idle_timeout_seconds} seconds, poolsize={len(self.writer_pool.writers)}"
                )
                self.writer_pool.remove_writer(blob_writer_identity)
            # if we're over capacity, evict the least recently used writers
            for blob_writer_identity in self.writer_pool.nominate_writers_to_evict():
                get_logger().debug(
                    f"Evicting {blob_writer_identity} from the writer pool due to the pool being over its {self.writer_pool_capacity} capacity, poolsize={len(self.writer_pool.writers)}"
                )
                self.writer_pool.remove_writer(blob_writer_identity)
        finally:
            if locked:
                self._pool_lock.release()
        time.sleep(0.1)
    get_logger().debug("Pool attendant off-duty")
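# The attendant relies on the pool tracking a last-write time per writer.
# The real WriterPool isn't shown in this section, so the sketch below is
# illustrative only - the attribute layout and bookkeeping are assumptions,
# not the actual API:

import time


class IllustrativeWriterPool:
    def __init__(self, pool_size: int):
        self.pool_size = pool_size
        self.writers: dict = {}  # identity -> last write timestamp (assumed)

    def get_stale_writers(self, idle_timeout_seconds: int):
        # writers with no append since the timeout are eviction candidates
        cutoff = time.time() - idle_timeout_seconds
        return [
            identity
            for identity, last_write in self.writers.items()
            if last_write < cutoff
        ]

    def nominate_writers_to_evict(self):
        # when over capacity, nominate the least recently used writers
        oldest_first = sorted(self.writers, key=self.writers.get)
        return oldest_first[: max(0, len(self.writers) - self.pool_size)]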
def test_log_sanitizer():
    """
    caplog records the message before the formatter runs, so it can't be
    used to test sanitization of logs automatically. These are another set
    of smoke tests - they should run without error.
    """
    logger = get_logger()
    logger.audit({"password": "******"})
    logger.audit('{"password": "******"}')
    logger.audit("password:topsecret3")
    logger.audit(["password", "top secret 4"])
    logger.debug({"alpha": "`1`", "beta": "`2`", "gamma": "'3'", "delta": "'4'"})
    logger.debug("{'action':'list','limit':'1000'}")
def test_smoke_test():
    """
    This is just a smoke test; it exercises most of the logging
    functionality and should just work.
    """
    logger = get_logger()
    logger.setLevel(LEVELS.DEBUG)
    logger.debug("debug")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warned")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warned")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warn")
    logger.error("error")
    logger.alert("alert")
    logger.audit("audit")
def test_new_log_levels(caplog):
    """
    caplog is a feature of pytest that allows logs to be captured and
    inspected. It is passed to a test function and has an attribute
    .record_tuples which is a stack of logging messages. To read the last
    item, pop it off the stack. It is in the form of:
    log name, log level, log message
    """
    if caplog is None:  # pragma: no cover
        print("unable to test logging interactively - use pytest")
        return
    logger = get_logger()
    logger.audit("this is a sample audit")
    res = caplog.record_tuples.pop()
    assert res == (LOG_NAME, LEVELS.AUDIT, "this is a sample audit"), res
    logger.alert("this is a sample alert")
    res = caplog.record_tuples.pop()
    assert res == (LOG_NAME, LEVELS.ALERT, "this is a sample alert"), res
import sys
import os
import time
import statistics

sys.path.insert(1, os.path.join(sys.path[0], "../.."))

from mabel.data import BatchWriter
from mabel.adapters.null import NullWriter
from mabel.adapters.disk import DiskWriter
from mabel.logging import get_logger
from mabel.data.validator import Schema
from mabel.data.internals import display, dictset
import ujson as json

logger = get_logger()
logger.setLevel(100)

schema_definition = {
    "description": "test data",
    "fields": [
        {"name": "user_id", "type": "numeric"},
        {"name": "user_name", "type": "string"},
        {
        raw_path=True,
        filters=("user_name", "==", username),
    )
    res = [r for r in reader]
    print(len(res))
    return (time.perf_counter_ns() - start) / 1e9


import os
import sys
import time

sys.path.insert(1, os.path.join(sys.path[0], "../.."))

from mabel.data import Reader
from mabel.adapters.disk import DiskReader
from mabel.data.internals.index import Index  # needed for build_index below
from mabel.logging import get_logger

get_logger().setLevel(100)

user_name = "Verizon Support"
print("indexed\t:", time_it("tests/data/index/is", user_name))
print("not indexed\t:", time_it("tests/data/index/not", user_name))
print("indexed\t:", time_it("tests/data/index/is", user_name + "bb"))
print("not indexed\t:", time_it("tests/data/index/not", user_name + "bb"))

reader = Reader(
    inner_reader=DiskReader,
    dataset="tests/data/index/is",
    raw_path=True,
)
idx = Index.build_index(reader, "user_name")
def append(self, record: Union[dict, BaseModel]):
    """
    Append a new record to the Writer

    Parameters:
        record: dictionary
            The record to append to the Writer

    Returns:
        integer
            When the identity has no placeholders, the result of the
            underlying writer's append; otherwise the number of partition
            writes performed
    """
    # get the appropriate writer from the pool and append the record -
    # the writer identity is the base of the path where the partitions
    # are written

    # Check the new record conforms to the schema; unlike the batch
    # writer, we don't want to bail out if we have a problem here,
    # instead we're going to save the record to a BACKOUT partition
    writes = 0
    identity = paths.date_format(self.dataset_template, datetime.date.today())
    if isinstance(record, BaseModel):
        record = record.dict()
    elif self.schema and not self.schema.validate(subject=record, raise_exception=False):
        identity += "/BACKOUT/"
        get_logger().warning(
            f"Schema Validation Failed ({self.schema.last_error}) - message being written to {identity}"
        )

    # take the shared pool lock so we don't race the pool attendant
    locked = self._pool_lock.acquire(timeout=10)
    try:
        # get the placeholders from the dataset name, deduplicated but
        # keeping first-seen order so they line up with `values` below
        placeholders = list(dict.fromkeys(re.findall(r"\{(.*?)\}", identity)))

        # there are no substitutions needed, so just write the record
        if len(placeholders) == 0:
            blob_writer = self.writer_pool.get_writer(identity)
            return blob_writer.append(record)

        # get the values from the record, there can be multiple of these
        values = []
        for placeholder in placeholders:
            value = record.get(placeholder)
            if hasattr(value, "as_list"):
                value = value.as_list()  # type:ignore
            if not isinstance(value, list):
                value = [value]
            values.append(value)

        # get the cartesian product of these lists - save the result to a
        # set to deduplicate the combinations
        value_combinations = set(itertools.product(*values))

        # for every variation in the cartesian product
        for combination in value_combinations:
            this_identity = identity
            # do the actual replacing of the placeholders
            for k, v in zip(placeholders, combination):
                this_identity = this_identity.replace(
                    "{" + k + "}", text.sanitize(str(v)))
            # get the writer and save the record
            blob_writer = self.writer_pool.get_writer(this_identity)
            blob_writer.append(record)
            writes += 1
    finally:
        if locked:
            self._pool_lock.release()

    return writes
def __init__(
    self,
    *,
    dataset: str,
    format: str = "zstd",
    idle_timeout_seconds: int = 30,
    writer_pool_capacity: int = 10,
    **kwargs,
):
    """
    Create a Data Writer to write data records into partitions.

    Parameters:
        dataset: string (optional)
            The name of the dataset - this is used to map to a path
        schema: mabel.validator.Schema (optional)
            Schema used to test records for conformity, default is no
            schema and therefore no validation
        format: string (optional)
            - jsonl: raw json lines
            - lzma: lzma compressed json lines
            - zstd: zstandard compressed json lines (default)
            - parquet: Apache Parquet
        idle_timeout_seconds: integer (optional)
            The number of seconds to wait before evicting writers from the
            pool for inactivity, default is 30 seconds
        writer_pool_capacity: integer (optional)
            The number of writers to leave in the writers pool before
            writers are evicted for over capacity, default is 10
        blob_size: integer (optional)
            The maximum size of blobs, the default is 32Mb
        inner_writer: BaseWriter (optional)
            The component used to commit data, the default writer is the
            NullWriter

    Note:
        Different inner_writers may take or require additional parameters.
    """
    if kwargs.get("date"):
        get_logger().warning("Cannot specify a `date` for the StreamWriter.")

    # add the values to kwargs
    kwargs["format"] = format
    kwargs["dataset"] = dataset
    self.dataset = dataset

    super().__init__(**kwargs)

    self.idle_timeout_seconds = idle_timeout_seconds

    # we have a pool of writers of size writer_pool_capacity
    self.writer_pool_capacity = writer_pool_capacity
    self.writer_pool = WriterPool(pool_size=writer_pool_capacity, **kwargs)

    # a single lock shared by append(), the pool attendant and finalize()
    # so mutations of the pool are serialized
    self._pool_lock = threading.Lock()

    # establish the background thread responsible for the pool
    self.thread = threading.Thread(target=self.pool_attendant)
    self.thread.name = "mabel-writer-pool-attendant"
    self.thread.daemon = True
    self.run_pool_attendant = True
    self.thread.start()
    get_logger().debug("Pool attendant on-duty")
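# A minimal usage sketch - illustrative only. It assumes StreamWriter is
# exported from mabel.data the same way BatchWriter is, and the `level`
# field and "logs/{level}" dataset are invented for the example:

from mabel.data import StreamWriter
from mabel.adapters.disk import DiskWriter

writer = StreamWriter(
    inner_writer=DiskWriter,
    dataset="logs/{level}",  # `{level}` is filled from each record
    format="jsonl",
    idle_timeout_seconds=30,
    writer_pool_capacity=10,
)
# a list-valued placeholder fans out via the cartesian product: this
# appends the record under both the logs/INFO and logs/ERROR partitions
# and returns 2
writer.append({"level": ["INFO", "ERROR"], "message": "hello"})
writer.finalize()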
import os
import sys

sys.path.insert(1, os.path.join(sys.path[0], ".."))

from mabel import Reader, DictSet
from mabel.data import STORAGE_CLASS
from mabel.adapters.disk import DiskReader
from mabel.logging import get_logger

get_logger().setLevel(5)

STORAGE_CLASSES = [
    STORAGE_CLASS.NO_PERSISTANCE,
    STORAGE_CLASS.COMPRESSED_MEMORY,
    STORAGE_CLASS.MEMORY,
    STORAGE_CLASS.DISK,
]


def get_ds(**kwargs):
    ds = Reader(
        inner_reader=DiskReader,
        dataset="tests/data/tweets",
        raw_path=True,
        **kwargs,
    )
    return ds


def test_count():
    for storage_class in STORAGE_CLASSES:
        ds = get_ds(persistence=storage_class)
        if storage_class == STORAGE_CLASS.NO_PERSISTANCE:
│ Filter     │ Apply full set of row filters to the read data             │
│            │                                                            │
│ Reduce     │ Aggregate                                                  │
└────────────┴────────────────────────────────────────────────────────────┘
"""
from mabel import logging
from . import decompressors, parsers

from enum import Enum
from functools import reduce

from ....utils import paths
from ....data.internals.records import flatten
from ....data.internals.index import Index
from ....data.internals.expression import Expression
from ....data.internals.dnf_filters import DnfFilters

logger = logging.get_logger()


def empty_list(x):
    return []


class EXTENSION_TYPE(str, Enum):
    # labels for the file extensions
    DATA = "DATA"
    CONTROL = "CONTROL"
    INDEX = "INDEX"


KNOWN_EXTENSIONS = {
    ".txt": (decompressors.block, parsers.pass_thru, EXTENSION_TYPE.DATA),
def get_list_of_blobs(self):

    blobs = []
    # For each day in the range, get the blobs for us to read
    for cycle_date in dates.date_range(self.start_date, self.end_date):
        # Build the path name
        cycle_path = pathlib.Path(
            paths.build_path(path=self.dataset, date=cycle_date))
        cycle_blobs = list(self.get_blobs_at_path(path=cycle_path))

        # Remove any BACKOUT data - this is essentially a DEAD LETTER queue
        # so we don't want to include it when reading
        cycle_blobs = [blob for blob in cycle_blobs if "BACKOUT" not in blob]

        # The partitions are stored in folders with the prefix 'by_', as in,
        # partitioned **by** field name
        list_of_partitions = {
            self._extract_by(blob) for blob in cycle_blobs if "/by_" in blob
        }

        # If we've been provided a partition_filter search hint, try to use
        # this first to prune data
        chosen_partition = ""
        if self.partition_filter:
            from mabel.utils import text

            # break the filter into parts, and make sure they're safe and valid
            (
                partition_filter_field,
                partition_filter_op,
                partition_filter_value,
            ) = self.partition_filter
            if partition_filter_op not in ("=", "=="):
                raise NotImplementedError(
                    "`partition_filter` operation can only be equals (`=`)"
                )
            partition_filter_field = text.sanitize(partition_filter_field)
            partition_filter_value = text.sanitize(partition_filter_value)
            partition_filter = f"/by_{partition_filter_field}/{partition_filter_field}={partition_filter_value}/"

            # If we can find the partition in the folder set, then prune to it
            if any(f"by_{partition_filter_field}" in by for by in list_of_partitions):
                # Do the pruning
                cycle_blobs = [
                    blob for blob in cycle_blobs if partition_filter in blob
                ]
                # We only have one partition now
                list_of_partitions = [f"by_{partition_filter_field}"]
                get_logger().debug(
                    f"Applied partition filter by: `{partition_filter}`")
            else:
                get_logger().debug(
                    f"Wasn't able to find partition to filter by: `{partition_filter}`"
                )

        # If we have multiple 'by_' partitions, pick one (the last when sorted)
        if list_of_partitions:
            list_of_partitions = sorted(list_of_partitions)
            chosen_partition = list_of_partitions.pop()
            if list_of_partitions:
                get_logger().info(
                    f"Ignoring {len(list_of_partitions)} 'by' partitionings, reading from '{chosen_partition}'"
                )
            # Do the pruning
            cycle_blobs = [
                blob for blob in cycle_blobs if f"/{chosen_partition}/" in blob
            ]

        def safe_get_next(lst, item):
            # the element after `item` in `lst`, or None if absent or last
            try:
                index = lst.index(item)
                return lst[index + 1]
            except (ValueError, IndexError):
                return None

        # Cycle over the partitioned folders (e.g. the hour=02 parts) - we
        # can't reuse the frame id of one folder for the rest
        if chosen_partition == "":
            partitioned_folders = {""}
        else:
            partitioned_folders = {
                safe_get_next(blob.split("/"), chosen_partition)
                for blob in cycle_blobs
            }

        for partitioned_folder in partitioned_folders:
            partitioned_blobs = [
                blob for blob in cycle_blobs
                if f"{chosen_partition}/{partitioned_folder}" in blob
            ]

            # Work out if there's an as_at part
            as_ats = {
                self._extract_as_at(blob)
                for blob in partitioned_blobs
                if "as_at_" in blob
            }
            if as_ats:
                as_ats = sorted(as_ats)
                as_at = as_ats.pop()

                def is_complete(blobs):
                    return any(as_at + "/frame.complete" in blob for blob in blobs)

                def is_invalid(blobs):
                    return any(as_at + "/frame.ignore" in blob for blob in blobs)

                # step back through the frames until we find one that is
                # complete and hasn't been invalidated
                while not is_complete(partitioned_blobs) or is_invalid(partitioned_blobs):
                    if not is_complete(partitioned_blobs):
                        get_logger().debug(
                            f"Frame `{partitioned_folder}/{as_at}` is not complete - `frame.complete` file is not present - skipping this frame."
                        )
                    if is_invalid(partitioned_blobs):
                        get_logger().debug(
                            f"Frame `{partitioned_folder}/{as_at}` is invalid - `frame.ignore` file is present - skipping this frame."
                        )
                    if len(as_ats) > 0:
                        as_at = as_ats.pop()
                    else:
                        return []
                get_logger().debug(f"Reading from DataSet frame `{as_at}`")
                partitioned_blobs = [
                    blob for blob in partitioned_blobs
                    if (as_at in blob) and ("/frame.complete" not in blob)
                ]

            blobs += partitioned_blobs

    return sorted(blobs)
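# Illustrative only: the kind of blob layout the logic above navigates.
# The path components below are invented for the example; only the markers
# come from the code (BACKOUT dead letters, by_<field> partitions,
# as_at_<id> frames with frame.complete / frame.ignore sentinels):

EXAMPLE_BLOBS = [
    "dataset/2021-01-01/by_hour/hour=02/as_at_1000/frame.complete",
    "dataset/2021-01-01/by_hour/hour=02/as_at_1000/part-0000.zstd",
    "dataset/2021-01-01/by_hour/hour=02/as_at_0999/part-0000.zstd",  # superseded frame
    "dataset/2021-01-01/BACKOUT/part-0000.zstd",  # dead letter, never read
]

# the pruning mirrors the steps above: drop BACKOUT, keep the chosen
# partition's newest complete frame, and drop the sentinel files themselves
readable = [
    blob
    for blob in EXAMPLE_BLOBS
    if "BACKOUT" not in blob
    and "as_at_1000" in blob
    and "/frame.complete" not in blob
]
assert readable == ["dataset/2021-01-01/by_hour/hour=02/as_at_1000/part-0000.zstd"]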