Example 1
0
def run_coalescer(cfg: dict,
                  tables: List[str],
                  periodstr: str,
                  run_once: bool,
                  logger: Logger,
                  no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs it once and returns or periodically depending on the
    value of run_once. It also writes out the coalescer records
    as a parquet file.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param periodstr: str, the string of how periodically the poller runs,
                      Examples are '1h', '1d' etc.
    :param run_once: bool, True if you want the poller to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller rec
    :returns: Nothing
    :rtype: none

    """

    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)

    # Refuse to run with a malformed period string rather than failing later
    status, errmsg = validate_periodstr(periodstr)
    if not status:
        logger.error(errmsg)
        print(f'ERROR: {errmsg}')
        sys.exit(1)

    while True:
        # BUGFIX: reset stats each iteration; it was unbound (NameError) if
        # do_coalesce raised on the first pass, and stale — causing a
        # duplicate stats write — if it raised on a later pass.
        stats = None
        try:
            stats = do_coalesce(cfg, tables, periodstr, logger, no_sqpoller)
        except Exception:
            logger.exception('Coalescer aborted. Continuing')
        # Write the self-stats
        if stats:
            df = pd.DataFrame([asdict(x) for x in stats])
            if not df.empty:
                df['sqvers'] = coalescer_schema.version
                df['version'] = SUZIEQ_VERSION
                df['active'] = True
                df['namespace'] = ''
                pqdb.write('sqCoalescer', 'pandas', df, True,
                           coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep_time = get_sleep_time(periodstr)
        sleep(sleep_time)
Example 2
0
 def __init__(self, baseobj):
     """Initialize summarizer state from the parent sqobject *baseobj*."""
     self.ctxt = baseobj.ctxt
     self.iobj = baseobj
     # Every summarization category starts out with no entries
     for listattr in ('summary_row_order',
                      '_summarize_on_add_field',
                      '_summarize_on_add_with_query',
                      '_summarize_on_add_list_or_count',
                      '_summarize_on_add_stat',
                      '_summarize_on_perdevice_stat'):
         setattr(self, listattr, [])
     self._dbeng = get_sqdb_engine(baseobj.ctxt.cfg, baseobj.table, '',
                                   None)
Example 3
0
 def __init__(self, baseobj):
     """Set up per-namespace summarization state from the parent sqobject."""
     self.ctxt = baseobj.ctxt
     self.iobj = baseobj
     # namespace grouping state, filled in during summarization
     self.nsgrp = None
     self.ns = []
     self.ns_df = pd.DataFrame()
     self.summary_df = pd.DataFrame()
     # Every summarization category starts out with no entries
     for listattr in ('summary_row_order',
                      '_summarize_on_add_field',
                      '_summarize_on_add_with_query',
                      '_summarize_on_add_list_or_count',
                      '_summarize_on_add_stat',
                      '_summarize_on_perdevice_stat'):
         setattr(self, listattr, [])
     self._check_empty_col = 'namespace'
     self._dbeng = get_sqdb_engine(baseobj.ctxt.cfg, baseobj.table, '',
                                   None)
Example 4
0
def test_transform(input_file):
    """Apply the transforms in input_file, coalesce, and verify the result.

    Reads a YAML transform spec, mutates a copy of the parquet data per
    the spec's transform section, runs the coalescer, checks the table
    view is unchanged by coalescing, and then runs the spec's verify
    section against the coalesced data.

    :param input_file: path to the YAML transformation/verification spec
    """
    to_transform = Yaml2Class(input_file)

    # A spec without a data directory is unusable; fail the test outright
    try:
        data_directory = to_transform.transform.data_directory
    except AttributeError:
        print('Invalid transformation file, no data directory')
        pytest.fail('AttributeError', pytrace=True)

    #  Make a copy of the data directory so the original stays pristine
    temp_dir, tmpfile = _coalescer_init(data_directory)

    cfg = load_sq_config(config_file=tmpfile.name)
    schemas = Schema(cfg['schema-directory'])

    for ele in to_transform.transform.transform:
        query_str_list = []
        # Each transformation has a record => writes happen per record
        for record in ele.record:
            changed_fields = set()
            new_df = pd.DataFrame()
            # The record's non-underscore attributes name the tables to modify
            tables = [x for x in dir(record) if not x.startswith('_')]
            for table in tables:
                # Let's read the data in now that we know the table
                tblobj = get_sqobject(table)
                pq_db = get_sqdb_engine(cfg, table, None, None)
                columns = schemas.fields_for_table(table)
                mod_df = tblobj(config_file=tmpfile.name).get(columns=columns)

                for key in getattr(record, table):
                    # 'all' selects every row; anything else is a pandas query
                    query_str = key.match
                    chg_df = pd.DataFrame()
                    if query_str != "all":
                        try:
                            chg_df = mod_df.query(query_str) \
                                           .reset_index(drop=True)
                        except Exception as ex:
                            assert (not ex)
                        query_str_list.append(query_str)
                    else:
                        chg_df = mod_df

                    # Apply the spec's field changes in place on chg_df
                    _process_transform_set(key.set, chg_df, changed_fields)
                    if new_df.empty:
                        new_df = chg_df
                    elif not chg_df.empty:
                        new_df = pd.concat([new_df, chg_df])

                if new_df.empty:
                    continue

                # Write the records now
                # NOTE(review): new_df is only reset per record, so a later
                # table's write appears to also carry earlier tables' rows
                # within the same record — confirm this accumulation is
                # intended
                _write_verify_transform(new_df, table, pq_db,
                                        SchemaForTable(table,
                                                       schemas), tmpfile.name,
                                        query_str_list, changed_fields)

    # Now we coalesce and verify it works
    from suzieq.sqobjects.tables import TablesObj

    pre_table_df = TablesObj(config_file=tmpfile.name).get()
    do_coalesce(cfg, None)
    _verify_coalescing(temp_dir)

    # Coalescing must not change what the tables view reports
    post_table_df = TablesObj(config_file=tmpfile.name).get()
    assert_df_equal(pre_table_df, post_table_df, None)

    # Run additional tests on the coalesced data
    for ele in to_transform.transform.verify:
        # Each verify entry's sole non-underscore attribute is the table name
        table = [x for x in dir(ele) if not x.startswith('_')][0]
        tblobj = get_sqobject(table)

        for tst in getattr(ele, table):
            start_time = tst.test.get('start-time', '')
            end_time = tst.test.get('end-time', '')

            columns = tst.test.get('columns', ['default'])
            df = tblobj(config_file=tmpfile.name,
                        start_time=start_time,
                        end_time=end_time).get(columns=columns)
            # Optional post-filter before the assertions below
            if not df.empty and 'query' in tst.test:
                query_str = tst.test['query']
                df = df.query(query_str).reset_index(drop=True)

            # The spec asserts emptiness, an exact shape ('*' wildcards a
            # dimension), or simply that rows exist
            if 'assertempty' in tst.test:
                assert (df.empty)
            elif 'shape' in tst.test:
                shape = tst.test['shape'].split()
                if shape[0] != '*':
                    assert (int(shape[0]) == df.shape[0])
                if shape[1] != '*':
                    assert (int(shape[1]) == df.shape[1])
            else:
                assert (not df.empty)

    _coalescer_cleanup(temp_dir, tmpfile)