def run_coalescer(cfg: dict, tables: List[str], periodstr: str,
                  run_once: bool, logger: Logger,
                  no_sqpoller: bool = False) -> None:
    """Run the coalescer. Runs it once and returns or periodically
    depending on the value of run_once. It also writes out the
    coalescer records as a parquet file.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param periodstr: str, the string of how periodically the poller
                      runs, Examples are '1h', '1d' etc.
    :param run_once: bool, True if you want the poller to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no
                        sqpoller rec
    :returns: Nothing
    :rtype: none
    """
    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)

    # Validate the period before entering the loop; an invalid period
    # string is a fatal configuration error.
    status, errmsg = validate_periodstr(periodstr)
    if not status:
        logger.error(errmsg)
        print(f'ERROR: {errmsg}')
        sys.exit(1)

    while True:
        # Reset stats every pass: this keeps the name bound even if
        # do_coalesce raises on the very first iteration (previously an
        # UnboundLocalError), and prevents a failed pass from re-writing
        # the previous iteration's stale records.
        stats = []
        try:
            stats = do_coalesce(cfg, tables, periodstr, logger,
                                no_sqpoller)
        except Exception:
            logger.exception('Coalescer aborted. Continuing')
        # Write the self-stats of this coalescing run
        if stats:
            df = pd.DataFrame([asdict(x) for x in stats])
            if not df.empty:
                df['sqvers'] = coalescer_schema.version
                df['version'] = SUZIEQ_VERSION
                df['active'] = True
                df['namespace'] = ''
                pqdb.write('sqCoalescer', 'pandas', df, True,
                           coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep_time = get_sleep_time(periodstr)
        sleep(sleep_time)
def __init__(self, baseobj):
    """Initialize summarizer state from the underlying table object."""
    self.ctxt = baseobj.ctxt
    self.iobj = baseobj
    self.summary_row_order = []
    # Hook lists that concrete summarizers populate with the fields,
    # queries and stats they want computed.
    for hook in ('_summarize_on_add_field',
                 '_summarize_on_add_with_query',
                 '_summarize_on_add_list_or_count',
                 '_summarize_on_add_stat',
                 '_summarize_on_perdevice_stat'):
        setattr(self, hook, [])
    self._dbeng = get_sqdb_engine(baseobj.ctxt.cfg, baseobj.table,
                                  '', None)
def __init__(self, baseobj):
    """Initialize summarizer state from the underlying table object."""
    self.ctxt = baseobj.ctxt
    self.iobj = baseobj
    self.summary_row_order = []
    # Per-namespace working state filled in during summarization.
    self.nsgrp = None
    self.ns_df = pd.DataFrame()
    self.ns = []
    self.summary_df = pd.DataFrame()
    # Hook lists that concrete summarizers populate with the fields,
    # queries and stats they want computed.
    for hook in ('_summarize_on_add_field',
                 '_summarize_on_add_with_query',
                 '_summarize_on_add_list_or_count',
                 '_summarize_on_add_stat',
                 '_summarize_on_perdevice_stat'):
        setattr(self, hook, [])
    # Column used to decide whether the fetched data is empty.
    self._check_empty_col = 'namespace'
    self._dbeng = get_sqdb_engine(baseobj.ctxt.cfg, baseobj.table,
                                  '', None)
def test_transform(input_file):
    """End-to-end coalescer test: apply a YAML-described transform to a
    copy of the test data, coalesce it, and verify the coalesced data
    matches the pre-coalesce view plus any per-table assertions.

    :param input_file: path to the YAML transformation/verify spec
    """
    to_transform = Yaml2Class(input_file)

    try:
        data_directory = to_transform.transform.data_directory
    except AttributeError:
        print('Invalid transformation file, no data directory')
        pytest.fail('AttributeError', pytrace=True)

    # Make a copy of the data directory
    temp_dir, tmpfile = _coalescer_init(data_directory)

    cfg = load_sq_config(config_file=tmpfile.name)
    schemas = Schema(cfg['schema-directory'])

    for ele in to_transform.transform.transform:
        query_str_list = []
        # Each transformation has a record => write's happen per record
        for record in ele.record:
            changed_fields = set()
            new_df = pd.DataFrame()
            # Non-underscore attributes of the record are table names.
            tables = [x for x in dir(record) if not x.startswith('_')]
            for table in tables:
                # Lets read the data in now that we know the table
                tblobj = get_sqobject(table)
                pq_db = get_sqdb_engine(cfg, table, None, None)
                columns = schemas.fields_for_table(table)
                mod_df = tblobj(config_file=tmpfile.name).get(
                    columns=columns)

                for key in getattr(record, table):
                    # key.match is either a pandas query string or the
                    # literal "all" (apply the set to every row).
                    query_str = key.match
                    chg_df = pd.DataFrame()
                    if query_str != "all":
                        try:
                            chg_df = mod_df.query(query_str) \
                                .reset_index(drop=True)
                        except Exception as ex:
                            # A bad query string in the spec fails the test
                            assert (not ex)
                        query_str_list.append(query_str)
                    else:
                        chg_df = mod_df

                    # Apply the field changes in-place on chg_df.
                    _process_transform_set(key.set, chg_df,
                                           changed_fields)
                    if new_df.empty:
                        new_df = chg_df
                    elif not chg_df.empty:
                        new_df = pd.concat([new_df, chg_df])

                if new_df.empty:
                    continue

                # Write the records now
                _write_verify_transform(new_df, table, pq_db,
                                        SchemaForTable(table, schemas),
                                        tmpfile.name, query_str_list,
                                        changed_fields)

    # Now we coalesce and verify it works
    from suzieq.sqobjects.tables import TablesObj

    pre_table_df = TablesObj(config_file=tmpfile.name).get()
    do_coalesce(cfg, None)
    _verify_coalescing(temp_dir)

    # The tables view must be unchanged by coalescing.
    post_table_df = TablesObj(config_file=tmpfile.name).get()
    assert_df_equal(pre_table_df, post_table_df, None)

    # Run additional tests on the coalesced data
    for ele in to_transform.transform.verify:
        table = [x for x in dir(ele) if not x.startswith('_')][0]
        tblobj = get_sqobject(table)

        for tst in getattr(ele, table):
            start_time = tst.test.get('start-time', '')
            end_time = tst.test.get('end-time', '')

            columns = tst.test.get('columns', ['default'])
            df = tblobj(config_file=tmpfile.name, start_time=start_time,
                        end_time=end_time).get(columns=columns)
            if not df.empty and 'query' in tst.test:
                query_str = tst.test['query']
                df = df.query(query_str).reset_index(drop=True)

            # The spec asserts either emptiness, an exact/wildcard
            # shape ('*' means "any count"), or simply non-emptiness.
            if 'assertempty' in tst.test:
                assert (df.empty)
            elif 'shape' in tst.test:
                shape = tst.test['shape'].split()
                if shape[0] != '*':
                    assert (int(shape[0]) == df.shape[0])
                if shape[1] != '*':
                    assert (int(shape[1]) == df.shape[1])
            else:
                assert (not df.empty)

    _coalescer_cleanup(temp_dir, tmpfile)