Example #1
    def clean_up_before_shutdown():
        global baskerville_engine, logger

        if not logger:
            logger = get_logger('clean_up_before_shutdown')

        logger.info('Just a sec, finishing up...')
        if baskerville_engine:
            logger.info('Finishing up Baskerville...')
            baskerville_engine.finish_up()
        for each in PROCESS_LIST:
            each.terminate()
            each.join()
            logger.info(f'Stopped {each.name}...')
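Every snippet on this page calls get_logger from baskerville.util.helpers (see the import in Example #8 below). The helper itself is not shown here; the following is a minimal sketch of what such a factory might look like, inferred from the keyword arguments used in these examples. The default level, handler choice, and format string are all assumptions:

    import logging

    def get_logger(name, logging_level='INFO', output_file=None):
        # hypothetical reconstruction: returns a logging.Logger configured
        # with either a file handler or a stderr stream handler
        logger = logging.getLogger(name)
        logger.setLevel(logging_level)
        handler = (logging.FileHandler(output_file)
                   if output_file else logging.StreamHandler())
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(name)s %(levelname)s %(message)s'))
        logger.addHandler(handler)
        return logger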
Example #2
    def __init__(self, engine_conf):
        self.all_features = engine_conf.all_features
        self.extra_features = engine_conf.extra_features
        self.active_features = None
        self.active_feature_names = None
        self.updateable_active_features = None
        self.active_columns = None
        self.update_feature_cols = None
        self.column_renamings = None
        self.pre_group_by_calculations = None
        self.post_group_by_calculations = None

        self.logger = get_logger(self.__class__.__name__,
                                 logging_level=engine_conf.log_level,
                                 output_file=engine_conf.logpath)
Example #3
    def __init__(self, db_conf, engine_conf, clean_up):
        self.runtime = None
        # todo: does not belong here anymore - see feature manager
        self.active_features = None
        self.step_to_action = None
        self.remaining_steps = None
        self.logs_df = None
        self.db_conf = db_conf
        self.engine_conf = engine_conf
        self.all_features = self.engine_conf.all_features
        self.clean_up = clean_up
        self.db_url = get_jdbc_url(self.db_conf)
        self.logger = get_logger(self.__class__.__name__,
                                 logging_level=self.engine_conf.log_level,
                                 output_file=self.engine_conf.logpath)
Example #4
    def __init__(self, run_type, conf, register_metrics=True):
        super(BaskervilleAnalyticsEngine, self).__init__(conf)
        self.run_type = run_type
        self.pipeline = None
        self.performance_stats = None

        # set config's logger
        BaskervilleConfig.set_logger(conf['engine']['log_level'],
                                     conf['engine']['logpath'])
        self.config = BaskervilleConfig(self.config).validate()

        self.register_metrics = (self.config.engine.metrics
                                 and register_metrics)

        self.logger = get_logger(self.__class__.__name__,
                                 logging_level=conf['engine']['log_level'],
                                 output_file=conf['engine']['logpath'])
Example #5
    def __init__(self,
                 cache_config,
                 table_name,
                 columns_to_keep,
                 expire_if_longer_than=3600,
                 logger=None,
                 session_getter=get_spark_session,
                 group_by_fields=('target', 'ip'),
                 format_='parquet',
                 path='request_set_cache'):
        self.__cache = None
        self.__persistent_cache = None
        self.schema = None
        self.cache_config = cache_config
        self.table_name = table_name
        self.columns_to_keep = columns_to_keep
        self.expire_if_longer_than = expire_if_longer_than
        self.logger = logger if logger else get_logger(self.__class__.__name__)
        self.session_getter = session_getter
        self.group_by_fields = group_by_fields
        self.format_ = format_
        self.storage_level = StorageLevel.CUSTOM
        self.column_renamings = {
            'first_ever_request': 'start',
            'old_subset_count': 'subset_count',
            'old_features': 'features',
            'old_num_requests': 'num_requests',
        }
        self._count = 0
        self._last_updated = datetime.datetime.utcnow()
        self._changed = False
        self.file_manager = FileManager(path, self.session_getter())

        self.file_name = os.path.join(
            path, f'{self.__class__.__name__}.{self.format_}')
        self.temp_file_name = os.path.join(
            path, f'{self.__class__.__name__}temp.{self.format_}')

        if self.file_manager.path_exists(self.file_name):
            self.file_manager.delete_path(self.file_name)
        if self.file_manager.path_exists(self.temp_file_name):
            self.file_manager.delete_path(self.temp_file_name)
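Note the logger default above (`self.logger = logger if logger else get_logger(...)`): the logger is injectable, which makes the class easy to test. A minimal sketch of that pattern, with an illustrative class and a stub logger (both made up for this example):

    import logging
    from unittest import mock

    class Cache:
        """Illustrative stand-in for the request-set cache in Example #5."""
        def __init__(self, logger=None):
            # fall back to a per-class logger when none is injected
            self.logger = logger if logger else logging.getLogger(
                self.__class__.__name__)

    # in a test, a stub can be injected to assert on log calls
    stub = mock.Mock()
    Cache(logger=stub).logger.info('cache hit')
    stub.info.assert_called_once_with('cache hit')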
Example #6
def main():
    """
    Baskerville commandline arguments
    :return:
    """
    global baskerville_engine, logger
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "pipeline",
        help="Pipeline to use: es, rawlog, or kafka",
    )
    parser.add_argument(
        "-s", "--simulate", dest="simulate",  action="store_true",
        help="Simulate real-time run using kafka",
    )
    parser.add_argument(
        "-e", "--startexporter", dest="start_exporter",
        action="store_true",
        help="Start the Baskerville Prometheus exporter at the specified "
             "in the configuration port",
    )

    parser.add_argument(
        "-t", "--testmodel", dest="test_model",
        help="Add a test model in the models table",
        default=False,
        action="store_true"
    )

    parser.add_argument(
        "-c", "--conf", action="store", dest="conf_file",
        default=os.path.join(src_dir, '..', 'conf', 'baskerville.yaml'),
        help="Path to config file"
    )

    args = parser.parse_args()
    conf = parse_config(path=args.conf_file)

    baskerville_engine = BaskervilleAnalyticsEngine(
        args.pipeline, conf, register_metrics=args.start_exporter
    )
    logger = get_logger(
        __name__,
        logging_level=baskerville_engine.config.engine.log_level,
        output_file=baskerville_engine.config.engine.logpath
    )

    # start simulation if specified
    if args.simulate:
        spark = None
        if baskerville_engine.config.engine.use_spark:
            from baskerville.spark import get_spark_session
            spark = get_spark_session()  # baskerville.pipeline.spark

        logger.info('Starting simulation...')
        run_simulation(baskerville_engine.config, spark)

    # start baskerville prometheus exporter if specified
    if args.start_exporter:
        if not baskerville_engine.config.engine.metrics:
            raise RuntimeError('Cannot start exporter without metrics config')
        port = baskerville_engine.config.engine.metrics.port
        start_http_server(port)
        logger.info(f'Starting Baskerville Exporter at '
                    f'http://localhost:{port}')

    # populate with test data if specified
    if args.test_model:
        add_model_to_database(conf['database'])

    for p in PROCESS_LIST[::-1]:
        print(f"{p.name} starting...")
        p.start()

    logger.info('Starting Baskerville Engine...')
    baskerville_engine.run()
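Assuming main() lives in a module that is executed directly, a typical invocation might look like this (the file name and config path are illustrative):

    if __name__ == '__main__':
        # e.g. python main.py kafka --startexporter -c conf/baskerville.yaml
        main()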
Example #7
    @classmethod
    def set_logger(cls, log_level='INFO', log_path='baskerville.log'):
        global logger
        logger = get_logger(cls.__name__,
                            logging_level=log_level,
                            output_file=log_path)
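Because set_logger rebinds a module-level logger global, it has to run before any code that reads that global; Example #4 calls it right before validating the configuration. A short sketch of that ordering (raw_conf is an assumed configuration dict):

    # rebind the module-level logger before anything logs through it
    BaskervilleConfig.set_logger('DEBUG', 'baskerville.log')
    config = BaskervilleConfig(raw_conf).validate()  # raw_conf: assumed dict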
Example #8
# LICENSE file in the root directory of this source tree.

import json
import os
import warnings
from datetime import datetime
from functools import wraps

import dateutil
from baskerville.util.enums import ModelEnum
from baskerville.util.helpers import get_logger, get_default_data_path, \
    SerializableMixin
from dateutil.tz import tzutc
from baskerville.features import FEATURES

logger = get_logger(__name__)


class ConfigError(Exception, SerializableMixin):
    """
    Custom Error to be used in the configuration error report
    """
    def __init__(self, message, fields, exception_type=ValueError):
        if isinstance(fields, str):
            fields = [fields]
        self.args = message, fields, exception_type.__name__

    def __str__(self):
        m, f, e = self.args
        return f'({e}, `field(s)`: {",".join(f)}){m} '
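A quick illustration of how the error renders; the message and field name are made up:

    err = ConfigError('log_level must be a string', 'engine.log_level')
    print(err)
    # (ValueError, `field(s)`: engine.log_level)log_level must be a string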
Example #9
    def __init__(self):
        self.__registry = {}
        self.logger = get_logger(self.__class__.__name__)
Example #10
def maintain_db():
    """
    Runs the partitioning and archive scripts
    :return:
    """
    # todo: this can fail silently
    baskerville_root = os.environ.get(
        'BASKERVILLE_ROOT', '../../../../baskerville'
    )
    # we need the current config for the database details
    config = parse_config(path=f'{baskerville_root}/conf/baskerville.yaml')
    logger = get_logger(
        __name__,
        logging_level=config['engine']['log_level'],
        output_file=config['engine']['logpath']
    )
    db_config = DatabaseConfig(config['database']).validate()

    if db_config.maintenance.partition_by != 'week':
        raise NotImplementedError(
            f'Partition by {db_config.maintenance.partition_by} '
            f'is not yet implemented'
        )

    # maintenance will run every Sunday, so now should be Sunday night
    # move to the start of Monday
    now = datetime.utcnow()
    y, w, _ = now.isocalendar()
    partition_start_week = isoweek.Week(y, w + 1)
    start = datetime.combine(
        partition_start_week.monday(), datetime.min.time()
    )
    end = datetime.combine(
        partition_start_week.sunday(), datetime.max.time()
    )

    logger.info(f'Data Partition Start : {start}')

    diy = get_days_in_year(end.year)
    latest_archive_date = end - timedelta(days=diy)
    latest_archive_year, latest_archive_week, _ = latest_archive_date.isocalendar()
    print(latest_archive_week, latest_archive_year)

    if latest_archive_week > 1:
        latest_archive_week = latest_archive_week - 1
    else:
        latest_archive_week = isoweek.Week.last_week_of_year(
            latest_archive_year-1
        ).week
        latest_archive_year = latest_archive_year - 1
    week = isoweek.Week(latest_archive_year, latest_archive_week)

    print(week)

    db_config.maintenance.data_partition.since = start
    db_config.maintenance.data_partition.until = (
        start + timedelta(days=6)
    ).replace(
        hour=23, minute=59, second=59
    )

    db_config.maintenance.data_archive.since = datetime.combine(
        week.monday(), datetime.min.time()
    )
    db_config.maintenance.data_archive.until = datetime.combine(
        week.sunday(), datetime.max.time()
    )

    print(db_config.maintenance.data_partition)
    print(db_config.maintenance.data_archive)

    # get sql scripts
    partition_sql = get_temporal_partitions(db_config.maintenance)

    archive_sql = get_archive_script(
        latest_archive_date - timedelta(weeks=1),
        latest_archive_date
    )

    logger.debug(partition_sql)
    logger.debug(archive_sql)
    session, engine = set_up_db(db_config.__dict__, create=False)

    try:
        # create partitions
        session.execute(partition_sql)
        session.commit()
        print('Partitioning done')
        # detach partitions over a year and attach them to the archive table
        session.execute(archive_sql)
        session.commit()
        print('Archive done')

    except SQLAlchemyError as e:
        traceback.print_exc()
        session.rollback()
        logger.error(f'Error executing maintenance: {e}')
    finally:
        session.close()
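maintain_db is written around a weekly cadence (the inline comment notes it runs Sunday night). A minimal scheduling sketch, assuming cron and a runnable module; both the schedule and the module path are assumptions:

    # hypothetical crontab entry:
    # 55 23 * * 0  python -m baskerville.db.maintenance
    if __name__ == '__main__':
        maintain_db()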