Example No. 1
    def finalize(self, **kwargs):
        """
        Stop the pool attendant and remove every writer remaining in the
        pool before handing off to the parent class's finalize.
        """
        self.run_pool_attendant = False
        lock = threading.Lock()
        try:
            lock.acquire(blocking=True, timeout=10)
            for blob_writer_identity in self.writer_pool.writers:
                try:
                    get_logger().debug(
                        f"Removing from the writer pool during finalization, identity={blob_writer_identity['identity']}, poolsize={len(self.writer_pool.writers)}"
                    )
                    self.writer_pool.remove_writer(
                        blob_writer_identity["identity"])
                except Exception as err:
                    get_logger().debug(
                        f"Error finalizing `{blob_writer_identity}`, {type(err).__name__} - {err}"
                    )
        finally:
            lock.release()
        return super().finalize()
Example No. 2
    def pool_attendant(self):
        """
        Writer Pool Management
        """
        while self.run_pool_attendant:
            lock = threading.Lock()
            try:
                lock.acquire(blocking=True, timeout=10)

                # search for pool occupants who haven't had a write recently
                for blob_writer_identity in self.writer_pool.get_stale_writers(
                        self.idle_timeout_seconds):
                    get_logger().debug(
                        f"Evicting {blob_writer_identity} from the writer pool due to inactivity - limit is {self.idle_timeout_seconds} seconds, poolsize={len(self.writer_pool.writers)}"
                    )
                    self.writer_pool.remove_writer(blob_writer_identity)
                # if we're over capacity, evict the LRU writers
                for (blob_writer_identity
                     ) in self.writer_pool.nominate_writers_to_evict():
                    get_logger().debug(
                        f"Evicting {blob_writer_identity} from the writer pool due the pool being over its {self.writer_pool_capacity} capacity, poolsize={len(self.writer_pool.writers)}"
                    )
                    self.writer_pool.remove_writer(blob_writer_identity)

            finally:
                lock.release()

            time.sleep(0.1)

        get_logger().debug("Pool attendant off-duty")
Example No. 3
def test_log_sanitizer():
    """
    caplog captures records before the formatter runs, so it can't be used
    to test log sanitization automatically.

    These are another set of smoke tests; they should run without error.
    """
    logger = get_logger()
    logger.audit({"password": "******"})
    logger.audit('{"password": "******"}')
    logger.audit("password:topsecret3")
    logger.audit(["password", "top secret 4"])
    logger.debug({"alpha": "`1`", "beta": "`2`", "gamma": "'3'", "delta": "'4'"})
    logger.debug("{'action':'list','limit':'1000'}")
Example No. 4
def test_smoke_test():
    """
    This is just a smoke test; it exercises most of the logging functionality
    and should just work.
    """
    logger = get_logger()
    logger.setLevel(LEVELS.DEBUG)
    logger.debug("debug")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warned")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warned")
    logger.warning("warn")
    logger.warning("warn")
    logger.warning("warn")
    logger.error("error")
    logger.alert("alert")
    logger.audit("audit")
Example No. 5
def test_new_log_levels(caplog):
    """
    caplog is a feature of pytest that allows logs to be captured and
    inspected.

    It is passed to a test function and has an attribute .record_tuples, which
    is a stack of logging messages. To read the last item, pop it off the
    stack. Each item is in the form of:

    log name, log level, log message
    """
    if caplog is None:  # pragma: no cover
        print("unable to test logging interactively - use pytest")
        return

    logger = get_logger()

    logger.audit("this is a sample audit")
    res = caplog.record_tuples.pop()
    assert res == (LOG_NAME, LEVELS.AUDIT, "this is a sample audit"), res

    logger.alert("this is a sample alert")
    res = caplog.record_tuples.pop()
    assert res == (LOG_NAME, LEVELS.ALERT, "this is a sample alert"), res
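
pytest's caplog also exposes caplog.records, a list of the full logging.LogRecord objects, for checks that need more than the (name, level, message) triple. A sketch in the same style as the test above:

def test_audit_record_details(caplog):
    logger = get_logger()
    logger.audit("this is a sample audit")
    record = caplog.records[-1]            # full LogRecord, not just a tuple
    assert record.name == LOG_NAME
    assert record.levelno == LEVELS.AUDIT
    assert record.getMessage() == "this is a sample audit"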
Example No. 6
import sys
import os
import time
import statistics

sys.path.insert(1, os.path.join(sys.path[0], "../.."))
from mabel.data import BatchWriter
from mabel.adapters.null import NullWriter
from mabel.adapters.disk import DiskWriter
from mabel.logging import get_logger
from mabel.data.validator import Schema
from mabel.data.internals import display, dictset

import ujson as json

logger = get_logger()
logger.setLevel(100)

schema_definition = {
    "description":
    "test data",
    "fields": [
        {
            "name": "user_id",
            "type": "numeric"
        },
        {
            "name": "user_name",
            "type": "string"
        },
        {
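
A minimal sketch of validating a record against a definition of this shape; the Schema(schema_definition) constructor call is an assumption, while validate(subject=..., raise_exception=False) and last_error mirror the calls made in the append example below:

# hypothetical usage; passing the definition dict to the constructor is assumed
schema = Schema(schema_definition)

record = {"user_id": 123, "user_name": "alice"}
if not schema.validate(subject=record, raise_exception=False):
    # last_error describes why the record failed validation
    print(schema.last_error)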
Example No. 7
        raw_path=True,
        filters=("user_name", "==", username),
    )
    res = [r for r in reader]
    print(len(res))
    return (time.perf_counter_ns() - start) / 1e9


import os, sys

sys.path.insert(1, os.path.join(sys.path[0], "../.."))
from mabel.data import Reader
from mabel.adapters.disk import DiskReader
from mabel.logging import get_logger
from mabel.data.internals.index import Index  # used by Index.build_index below

get_logger().setLevel(100)

user_name = "Verizon Support"

print("indexed\t:", time_it("tests/data/index/is", user_name))
print("not indexed\t:", time_it("tests/data/index/not", user_name))
print("indexed\t:", time_it("tests/data/index/is", user_name + "bb"))
print("not indexed\t:", time_it("tests/data/index/not", user_name + "bb"))


reader = Reader(
    inner_reader=DiskReader,
    dataset="tests/data/index/is",
    raw_path=True,
)
idx = Index.build_index(reader, "user_name")
Example No. 8
    def append(self, record: Union[dict, BaseModel]):
        """
        Append a new record to the Writer

        Parameters:
            record: dictionary or BaseModel
                The record to append to the Writer

        Returns:
            integer
                The number of records in the current blob
        """
        # get the appropriate writer from the pool and append the record
        # the writer identity is the base of the path where the partitions
        # are written.

        # Check the new record conforms to the schema
        # unlike the batch writer, we don't want to bail out if we have a
        # problem here, instead we're going to save the file to a BACKOUT
        # partition

        writes = 0
        identity = paths.date_format(self.dataset_template,
                                     datetime.date.today())

        if isinstance(record, BaseModel):
            record = record.dict()
        elif self.schema and not self.schema.validate(subject=record,
                                                      raise_exception=False):
            identity += "/BACKOUT/"
            get_logger().warning(
                f"Schema Validation Failed ({self.schema.last_error}) - message being written to {identity}"
            )

        lock = threading.Lock()
        try:
            lock.acquire(blocking=True, timeout=10)

            # get the placeholders from the dataset name
            placeholders = set(re.findall(r"\{(.*?)\}", identity))

            # there's no substitutions needed, so just write the record
            if len(placeholders) == 0:
                blob_writer = self.writer_pool.get_writer(identity)
                return blob_writer.append(record)

            # get the values from the record, there can be multiple of these
            values = []
            for placeholder in placeholders:
                value = record.get(placeholder)
                if hasattr(value, "as_list"):
                    value = value.as_list()  # type:ignore
                if not isinstance(value, list):
                    value = [value]
                values.append(value)
            # get the cartesian product of these lists, materialised into a
            # set so duplicate combinations are removed
            value_combinations = set(itertools.product(*values))

            # for every variation in the cartesian product
            for values in value_combinations:  # type:ignore

                this_identity = identity
                # do the actual replacing of the placeholders
                for k, v in zip(placeholders, values):
                    this_identity = this_identity.replace(
                        "{" + k + "}", text.sanitize(str(v)))

                # get the writer and save the record
                blob_writer = self.writer_pool.get_writer(this_identity)
                blob_writer.append(record)
                writes += 1
        finally:
            lock.release()

        return writes
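
The placeholder expansion above is what fans a single record out to multiple writers. A stripped-down sketch of just that step, with an illustrative template and record that are not taken from the library:

import itertools
import re

identity = "logs/{team}/{region}"                  # illustrative template
record = {"team": "blue", "region": ["eu", "us"]}  # illustrative record

placeholders = set(re.findall(r"\{(.*?)\}", identity))

# collect the candidate values for each placeholder, always as a list
values = []
for placeholder in placeholders:
    value = record.get(placeholder)
    if not isinstance(value, list):
        value = [value]
    values.append(value)

# one target path per combination of placeholder values
for combination in set(itertools.product(*values)):
    path = identity
    for key, value in zip(placeholders, combination):
        path = path.replace("{" + key + "}", str(value))
    print(path)  # logs/blue/eu and logs/blue/us (in either order)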
Example No. 9
    def __init__(
        self,
        *,
        dataset: str,
        format: str = "zstd",
        idle_timeout_seconds: int = 30,
        writer_pool_capacity: int = 10,
        **kwargs,
    ):
        """
        Create a Data Writer to write data records into partitions.

        Parameters:
            dataset: string (required)
                The name of the dataset - this is used to map to a path
            schema: mabel.validator.Schema (optional)
                Schema used to test records for conformity, default is no
                schema and therefore no validation
            format: string (optional)
                - jsonl: raw json lines
                - lzma: lzma compressed json lines
                - zstd: zstandard compressed json lines (default)
                - parquet: Apache Parquet
            idle_timeout_seconds: integer (optional)
                The number of seconds to wait before evicting writers from the
                pool for inactivity, default is 30 seconds
            writer_pool_capacity: integer (optional)
                The number of writers to leave in the writers pool before
                writers are evicted for being over capacity, default is 10
            blob_size: integer (optional)
                The maximum size of blobs, the default is 32Mb
            inner_writer: BaseWriter (optional)
                The component used to commit data, the default writer is the
                NullWriter

        Note:
            Different inner_writers may take or require additional parameters.
        """
        if kwargs.get("date"):
            get_logger().warning(
                "Cannot specify a `date` for the StreamWriter.")

        # add the values to kwargs
        kwargs["format"] = format
        kwargs["dataset"] = dataset
        self.dataset = dataset

        super().__init__(**kwargs)

        self.idle_timeout_seconds = idle_timeout_seconds

        # we have a pool of writers of size maximum_writers
        self.writer_pool_capacity = writer_pool_capacity
        self.writer_pool = WriterPool(pool_size=writer_pool_capacity, **kwargs)

        # establish the background thread responsible for the pool
        self.thread = threading.Thread(target=self.pool_attendant)
        self.thread.name = "mabel-writer-pool-attendant"
        self.thread.daemon = True
        self.run_pool_attendant = True
        self.thread.start()
        get_logger().debug("Pool attendant on-duty")
Example No. 10
import os
import sys

sys.path.insert(1, os.path.join(sys.path[0], ".."))
from mabel import Reader, DictSet
from mabel.data import STORAGE_CLASS
from mabel.adapters.disk import DiskReader
from mabel.logging import get_logger

get_logger().setLevel(5)

STORAGE_CLASSES = [
    STORAGE_CLASS.NO_PERSISTANCE,
    STORAGE_CLASS.COMPRESSED_MEMORY,
    STORAGE_CLASS.MEMORY,
    STORAGE_CLASS.DISK,
]


def get_ds(**kwargs):
    ds = Reader(inner_reader=DiskReader,
                dataset="tests/data/tweets",
                raw_path=True,
                **kwargs)
    return ds


def test_count():
    for storage_class in STORAGE_CLASSES:
        ds = get_ds(persistence=storage_class)
        if storage_class == STORAGE_CLASS.NO_PERSISTANCE:
Example No. 11
│ Filter     │ Apply full set of row filters to the read data             │
│            │                                                            │
│ Reduce     │ Aggregate                                                  │
└────────────┴────────────────────────────────────────────────────────────┘
"""
from mabel import logging
from . import decompressors, parsers
from enum import Enum
from functools import reduce
from ....utils import paths
from ....data.internals.records import flatten
from ....data.internals.index import Index
from ....data.internals.expression import Expression
from ....data.internals.dnf_filters import DnfFilters

logger = logging.get_logger()


def empty_list(x):
    return []


class EXTENSION_TYPE(str, Enum):
    # labels for the file extensions
    DATA = "DATA"
    CONTROL = "CONTROL"
    INDEX = "INDEX"


KNOWN_EXTENSIONS = {
    ".txt": (decompressors.block, parsers.pass_thru, EXTENSION_TYPE.DATA),
Example No. 12
    def get_list_of_blobs(self):

        blobs = []
        # For each day in the range, get the blobs for us to read
        for cycle_date in dates.date_range(self.start_date, self.end_date):
            # Build the path name
            cycle_path = pathlib.Path(
                paths.build_path(path=self.dataset, date=cycle_date))
            cycle_blobs = list(self.get_blobs_at_path(path=cycle_path))

            # Remove any BACKOUT data - this is essentially a DEAD LETTER queue
            # so we don't want to include it when reading
            cycle_blobs = [
                blob for blob in cycle_blobs if "BACKOUT" not in blob
            ]

            # The partitions are stored in folders with the prefix 'by_', as in,
            # partitioned **by** field name
            list_of_partitions = {
                self._extract_by(blob)
                for blob in cycle_blobs if "/by_" in blob
            }

            # If we've been provided a partition_filter search hint, try to use this
            # first to prune data
            chosen_partition = ""

            if self.partition_filter:
                from mabel.utils import text

                # break the filter into parts, and make sure they're safe and valid
                (
                    partition_filter_field,
                    partition_filter_op,
                    partition_filter_value,
                ) = self.partition_filter
                if partition_filter_op not in ("=", "=="):
                    raise NotImplementedError(
                        "`partition_filter` operation can only be equals (`=`)"
                    )
                partition_filter_field = text.sanitize(partition_filter_field)
                partition_filter_value = text.sanitize(partition_filter_value)
                partition_filter = f"/by_{partition_filter_field}/{partition_filter_field}={partition_filter_value}/"

                # If we can find the partition in the folder set, then prune to it
                if any([
                        f"by_{partition_filter_field}" in by
                        for by in list_of_partitions
                ]):
                    # Do the pruning
                    cycle_blobs = [
                        blob for blob in cycle_blobs
                        if partition_filter in blob
                    ]
                    #  We only have one partition now
                    list_of_partitions = [f"by_{partition_filter_field}"]
                    get_logger().debug(
                        f"Applied partition filter by: `{partition_filter}`")
                else:
                    get_logger().debug(
                        f"Wasn't able to find partition to filter by: `{partition_filter}`"
                    )

            # If we have multiple 'by_' partitions, pick one (pick the first one)
            if list_of_partitions:
                list_of_partitions = sorted(list_of_partitions)
                chosen_partition = list_of_partitions.pop()
                if list_of_partitions:
                    get_logger().info(
                        f"Ignoring {len(list_of_partitions)} 'by' partitionings, reading from '{chosen_partition}'"
                    )
                # Do the pruning
                cycle_blobs = [
                    blob for blob in cycle_blobs
                    if f"/{chosen_partition}/" in blob
                ]

            def safe_get_next(lst, item):
                try:
                    index = lst.index(item)
                    return lst[index + 1]
                except (ValueError, IndexError):  # item missing or last in the list
                    return None

            # Cycle over the list of partitions (e.g. the hour=02 parts); we
            # can't use the frame id of one partition for the rest
            if chosen_partition == "":
                partitioned_folders = {""}
            else:
                partitioned_folders = {
                    safe_get_next(blob.split("/"), chosen_partition)
                    for blob in cycle_blobs
                }

            for partitioned_folder in partitioned_folders:

                partitioned_blobs = [
                    blob for blob in cycle_blobs
                    if f"{chosen_partition}/{partitioned_folder}" in blob
                ]

                # Work out if there's an as_at part
                as_ats = {
                    self._extract_as_at(blob)
                    for blob in partitioned_blobs if "as_at_" in blob
                }
                if as_ats:
                    as_ats = sorted(as_ats)
                    as_at = as_ats.pop()

                    is_complete = lambda blobs: any([
                        blob for blob in blobs
                        if as_at + "/frame.complete" in blob
                    ])
                    is_invalid = lambda blobs: any([
                        blob for blob in blobs
                        if (as_at + "/frame.ignore" in blob)
                    ])

                    while not is_complete(partitioned_blobs) or is_invalid(
                            partitioned_blobs):
                        if not is_complete(partitioned_blobs):
                            get_logger().debug(
                                f"Frame `{partitioned_folder}/{as_at}` is not complete - `frame.complete` file is not present - skipping this frame."
                            )
                        if is_invalid(partitioned_blobs):
                            get_logger().debug(
                                f"Frame `{partitioned_folder}/{as_at}` is invalid - `frame.ignore` file is present - skipping this frame."
                            )
                        if len(as_ats) > 0:
                            as_at = as_ats.pop()
                        else:
                            return []
                    get_logger().debug(f"Reading from DataSet frame `{as_at}`")
                    partitioned_blobs = [
                        blob for blob in partitioned_blobs
                        if (as_at in blob) and ("/frame.complete" not in blob)
                    ]

                blobs += partitioned_blobs

        return sorted(blobs)
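
A minimal sketch of supplying the partition hint described above; partition_filter is assumed to be accepted as a Reader keyword argument, and the dataset path is illustrative:

from mabel import Reader
from mabel.adapters.disk import DiskReader

# the filter is a (field, operator, value) triple; only equality is supported
reader = Reader(
    inner_reader=DiskReader,
    dataset="tests/data/partitioned",
    partition_filter=("user_name", "==", "Verizon Support"),
)
records = list(reader)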