import sys
from dataclasses import asdict
from datetime import datetime
from logging import Logger
from time import sleep
from typing import List

import pandas as pd
from dateparser import parse

# NOTE: the suzieq-internal imports below are inferred from usage in this
# function; adjust the module paths to match the repo layout.
# from suzieq.utils import Schema, SchemaForTable
# from suzieq.db import get_sqdb_engine, do_coalesce
# from suzieq.version import SUZIEQ_VERSION


def run_coalescer(cfg: dict, tables: List[str], period: str, run_once: bool,
                  logger: Logger, no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs once and returns, or runs periodically, depending on the value of
    run_once. It also writes out the coalescer records as a parquet file.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param period: str, how often the coalescer runs, e.g. '1h', '1d'
    :param run_once: bool, True if the coalescer should run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller rec
    :returns: Nothing
    :rtype: None
    """
    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)

    if not run_once:
        now = datetime.now()
        nextrun = parse(period, settings={'PREFER_DATES_FROM': 'future'})
        # total_seconds(), not .seconds, so periods longer than a day work
        sleep_time = int((nextrun - now).total_seconds())
        logger.info(f'Got sleep time of {sleep_time} secs')

    while True:
        stats = []   # ensure stats is defined even if do_coalesce raises
        try:
            stats = do_coalesce(cfg, tables, period, logger, no_sqpoller)
        except Exception:
            logger.exception('Coalescer aborted. Continuing')

        # Write the coalescer's self-stats
        df = pd.DataFrame([asdict(x) for x in stats])
        if not df.empty:
            df['sqvers'] = coalescer_schema.version
            df['version'] = SUZIEQ_VERSION
            df['active'] = True
            df['namespace'] = ''
            pqdb.write('sqCoalescer', 'pandas', df, True,
                       coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep(sleep_time)
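
# ---------------------------------------------------------------------------
# A minimal sketch of driving run_coalescer() for a one-shot run. The config
# path, the direct YAML load, and the table names are assumptions for
# illustration; the repo presumably has its own config loader and CLI
# entry point.
# ---------------------------------------------------------------------------
import logging

import yaml


def main() -> None:
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('sq-coalescer')

    # ASSUMPTION: the Suzieq config is a YAML file carrying (at least) the
    # 'schema-directory' key that run_coalescer() reads.
    with open('/etc/suzieq/suzieq-cfg.yml') as f:
        cfg = yaml.safe_load(f)

    # Example table names; any coalescible tables will do.
    run_coalescer(cfg, ['interfaces', 'routes'], period='1h',
                  run_once=True, logger=logger)


if __name__ == '__main__':
    main()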
def migrate(self, table_name: str, schema: SchemaForTable) -> None:
    """Migrate the data for the specified table to the latest version

    :param table_name: str, the name of the table to migrate
    :param schema: SchemaForTable, the schema of the current (target) version
    :returns: None
    :rtype: None
    """
    current_vers = schema.version
    defvals = self._get_default_vals()
    arrow_schema = schema.get_arrow_schema()
    schema_def = dict(zip(arrow_schema.names, arrow_schema.types))

    for sqvers in self._get_avail_sqvers(table_name, True):
        if sqvers != current_vers:
            migrate_rtn = get_migrate_fn(table_name, sqvers, current_vers)
            if migrate_rtn:
                dataset = self._get_cp_dataset(table_name, True, sqvers,
                                               'all', '', '')
                for item in dataset.files:
                    try:
                        namespace = item.split('namespace=')[1] \
                            .split('/')[0]
                    except IndexError:
                        # Don't convert data not in our template
                        continue

                    df = pd.read_parquet(item)
                    df['sqvers'] = sqvers
                    df['namespace'] = namespace
                    newdf = migrate_rtn(df)

                    cols = newdf.columns
                    # Ensure all fields are present
                    for field in schema_def:
                        if field not in cols:
                            newdf[field] = defvals.get(schema_def[field], '')

                    # sqvers and namespace stay in newdf: the arrow schema
                    # passed to from_pandas() below includes them, and
                    # from_pandas() fails on missing columns

                    newitem = item.replace(f'sqvers={sqvers}',
                                           f'sqvers={current_vers}')
                    newdir = os.path.dirname(newitem)
                    if not os.path.exists(newdir):
                        os.makedirs(newdir, exist_ok=True)

                    table = pa.Table.from_pandas(
                        newdf, schema=schema.get_arrow_schema(),
                        preserve_index=False)
                    pq.write_to_dataset(table, newitem, version="2.0",
                                        compression="ZSTD",
                                        row_group_size=100000)
                    self.logger.debug(
                        f'Migrated {item} version {sqvers}->{current_vers}')
                    os.remove(item)

                # All files for this version migrated; drop the old
                # version's directory tree
                rmtree(
                    f'{self._get_table_directory(table_name, True)}'
                    f'/sqvers={sqvers}',
                    ignore_errors=True)

    return
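
# ---------------------------------------------------------------------------
# migrate() leans on get_migrate_fn() returning a per-table, per-version
# converter with the signature DataFrame -> DataFrame. Below is a minimal
# sketch of what such a converter might look like; the table, column names,
# and version strings are hypothetical. Because migrate() backfills any
# still-missing schema fields with defaults after the converter runs, a
# converter only needs to handle fields whose meaning actually changed.
# ---------------------------------------------------------------------------
import pandas as pd


def _migrate_routes_v1_to_v2(df: pd.DataFrame) -> pd.DataFrame:
    """Hypothetical converter, e.g. get_migrate_fn('routes', '1.0', '2.0').

    Receives the old-version DataFrame (migrate() has already attached the
    'sqvers' and 'namespace' columns) and returns a frame shaped for the
    target version.
    """
    newdf = df.copy()
    # Hypothetical schema change: v2 renames 'nexthopIp' to 'nexthopIps'
    if 'nexthopIp' in newdf.columns:
        newdf = newdf.rename(columns={'nexthopIp': 'nexthopIps'})
    newdf['sqvers'] = '2.0'   # stamp the target schema version
    return newdf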