Example #1
0
    def test_parse_config_more_than_one_env_value(self):
        """Both env-var placeholders inside a single tagged value get substituted."""
        os.environ['TEST_ENV_TAG'] = 'it works!'
        os.environ['OTHER_TEST_TAG'] = 'this works too!'
        yaml_input = '''
        test1:
            data0: !TEST ${TEST_ENV_TAG}/somethingelse/${OTHER_TEST_TAG}
            data1:  !TEST ${OTHER_TEST_TAG}
        '''
        expected = {
            'test1': {
                'data0': 'it works!/somethingelse/this works too!',
                'data1': 'this works too!',
            },
        }
        parsed = parse_config(data=yaml_input, tag='!TEST')
        self.assertDictEqual(parsed, expected)
Example #2
0
    def test_parse_config_diff_tag(self):
        """A non-default tag name resolves env-var placeholders just the same."""
        os.environ['TEST_ENV_TAG'] = 'it works!'
        os.environ['OTHER_TEST_TAG'] = 'this works too!'
        yaml_input = '''
        test1:
            data0: !TEST ${TEST_ENV_TAG}
            data1:  !TEST ${OTHER_TEST_TAG}
        '''
        expected = {
            'test1': {
                'data0': 'it works!',
                'data1': 'this works too!',
            },
        }
        parsed = parse_config(data=yaml_input, tag='!TEST')
        self.assertDictEqual(parsed, expected)
Example #3
0
    def test_parse_config_with_file_path(self):
        """Config is loaded from a file on disk when a path is supplied."""
        os.environ['TEST_ENV_TAG'] = 'it works!'
        os.environ['OTHER_TEST_TAG'] = 'this works too!'
        yaml_input = '''
        test1:
            data0: !ENV ${TEST_ENV_TAG}
            data1:  !ENV ${OTHER_TEST_TAG}
        '''
        # Persist the YAML so parse_config exercises the path-based code path.
        with open(self.test_file_name, 'w') as fh:
            fh.write(yaml_input)

        expected = {
            'test1': {
                'data0': 'it works!',
                'data1': 'this works too!',
            },
        }
        self.assertDictEqual(parse_config(path=self.test_file_name), expected)
Example #4
0
def main():
    """
    Baskerville commandline entry point.

    Parses CLI arguments, builds the analytics engine from the chosen
    config file, optionally starts the simulation / Prometheus exporter /
    test-model seeding, then launches the engine.
    :return: None
    """
    global baskerville_engine, logger
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "pipeline",
        help="Pipeline to use: es, rawlog, or kafka",
    )
    parser.add_argument(
        "-s", "--simulate", dest="simulate", action="store_true",
        help="Simulate real-time run using kafka",
    )
    parser.add_argument(
        "-e", "--startexporter", dest="start_exporter",
        action="store_true",
        help="Start the Baskerville Prometheus exporter at the specified "
             "in the configuration port",
    )
    # NOTE: this option must be registered exactly once; a duplicate
    # add_argument("-t", ...) makes argparse raise ArgumentError at startup.
    parser.add_argument(
        "-t", "--testmodel", dest="test_model",
        help="Add a test model in the models table",
        default=False,
        action="store_true"
    )
    parser.add_argument(
        "-c", "--conf", action="store", dest="conf_file",
        default=os.path.join(src_dir, '..', 'conf', 'baskerville.yaml'),
        help="Path to config file"
    )

    args = parser.parse_args()
    conf = parse_config(path=args.conf_file)

    baskerville_engine = BaskervilleAnalyticsEngine(
        args.pipeline, conf, register_metrics=args.start_exporter
    )
    logger = get_logger(
        __name__,
        logging_level=baskerville_engine.config.engine.log_level,
        output_file=baskerville_engine.config.engine.logpath
    )

    # start simulation if specified
    if args.simulate:
        spark = None
        if baskerville_engine.config.engine.use_spark:
            from baskerville.spark import get_spark_session
            spark = get_spark_session()  # baskerville.pipeline.spark

        logger.info('Starting simulation...')
        run_simulation(baskerville_engine.config, spark)

    # start baskerville prometheus exporter if specified
    if args.start_exporter:
        if not baskerville_engine.config.engine.metrics:
            raise RuntimeError('Cannot start exporter without metrics config')
        port = baskerville_engine.config.engine.metrics.port
        start_http_server(port)
        logger.info(f'Starting Baskerville Exporter at '
                    f'http://localhost:{port}')

    # populate with test data if specified
    if args.test_model:
        add_model_to_database(conf['database'])

    # start auxiliary processes (reverse order) before the engine itself
    for p in PROCESS_LIST[::-1]:
        print(f"{p.name} starting...")
        p.start()

    logger.info('Starting Baskerville Engine...')
    baskerville_engine.run()
def maintain_db():
    """
    Runs the partitioning and archive scripts
    :return:
    """
    # todo: this can fail silently
    # Locate the project root; falls back to a relative path when the
    # BASKERVILLE_ROOT env var is not set.
    baskerville_root = os.environ.get(
        'BASKERVILLE_ROOT', '../../../../baskerville'
    )
    # we need the current config for the database details
    config = parse_config(path=f'{baskerville_root}/conf/baskerville.yaml')
    logger = get_logger(
        __name__,
        logging_level=config['engine']['log_level'],
        output_file=config['engine']['logpath']
    )
    db_config = DatabaseConfig(config['database']).validate()

    # Only weekly partitioning is supported by the logic below.
    if db_config.maintenance.partition_by != 'week':
        raise NotImplementedError(
            f'Partition by {db_config.maintenance.partition_by} '
            f'is not yet implemented'
        )

    # maintainance will run every Sunday, so now should be Sunday night
    # move to the start of Monday
    now = datetime.utcnow()
    y, w, _ = now.isocalendar()
    # NOTE(review): w + 1 can exceed the number of ISO weeks in the year
    # when this runs in the final week — presumably isoweek.Week normalizes
    # the overflow into the next year; confirm against the isoweek docs.
    partition_start_week = isoweek.Week(y, w + 1)
    # Partition window: Monday 00:00:00 through Sunday 23:59:59.999999
    # of the upcoming ISO week.
    start = datetime.combine(
        partition_start_week.monday(), datetime.min.time()
    )
    end = datetime.combine(
        partition_start_week.sunday(), datetime.max.time()
    )

    logger.info(f'Data Partition Start : {start}')

    # Anything older than roughly one year from the partition end gets archived.
    diy = get_days_in_year(end.year)
    latest_archive_date = end - timedelta(days=diy)
    latest_archive_year, latest_archive_week, _ = latest_archive_date.isocalendar()
    print(latest_archive_week, latest_archive_year)

    # Step one ISO week back, wrapping into the previous year's last week
    # when we are already at week 1.
    if latest_archive_week > 1:
        latest_archive_week = latest_archive_week - 1
    else:
        latest_archive_week = isoweek.Week.last_week_of_year(
            latest_archive_year-1
        ).week
        latest_archive_year = latest_archive_year - 1
    week = isoweek.Week(latest_archive_year, latest_archive_week)

    print(week)

    # Partition range: [start, start + 6 days 23:59:59] — one full week.
    db_config.maintenance.data_partition.since = start
    db_config.maintenance.data_partition.until = (
        start + timedelta(days=6)
    ).replace(
        hour=23, minute=59, second=59
    )

    # Archive range: the Monday..Sunday span of the computed archive week.
    db_config.maintenance.data_archive.since = datetime.combine(
        week.monday(), datetime.min.time()
    )
    db_config.maintenance.data_archive.until = datetime.combine(
        week.sunday(), datetime.max.time()
    )

    print(db_config.maintenance.data_partition)
    print(db_config.maintenance.data_archive)

    # get sql scripts
    partition_sql = get_temporal_partitions(db_config.maintenance)

    # NOTE(review): the archive script uses latest_archive_date directly,
    # not the `week` boundaries computed above — verify this is intentional.
    archive_sql = get_archive_script(
        latest_archive_date - timedelta(weeks=1),
        latest_archive_date
    )

    logger.debug(partition_sql)
    logger.debug(archive_sql)
    session, engine = set_up_db(db_config.__dict__, create=False)

    try:
        # create partitions
        session.execute(partition_sql)
        session.commit()
        print('Partitioning done')
        # detach partitions over a year and attach them to the archive table
        session.execute(archive_sql)
        session.commit()
        print('Archive done')

    except SQLAlchemyError as e:
        traceback.print_exc()
        session.rollback()
        logger.error(f'Error executing maintenance: {e}')
    finally:
        session.close()