def coalescer_main():
    """Command-line entry point for the Suzieq coalescer.

    Parses the command-line options, loads the Suzieq configuration, takes
    the single-instance lock on the coalescer PID file, works out the
    coalescing period and the table list, and hands off to
    ``run_coalescer``.

    The function always terminates the process through ``sys.exit``:
    status 1 when the config file cannot be loaded, ``errno.EBUSY`` when
    another coalescer already holds the lock, and 0 once ``run_coalescer``
    has returned.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--service-only",
        type=str,
        help="Only run this space separated list of services",
    )
    parser.add_argument(
        "-x",
        "--exclude-services",
        type=str,
        help="Exclude running this space separated list of services",
    )
    parser.add_argument(
        "-c",
        "--config",
        default=f'{os.getenv("HOME")}/.suzieq/suzieq-cfg.yml',
        type=str,
        help="alternate config file",
    )
    parser.add_argument(
        "--run-once",
        default=False,
        help='Run the coalescer once and exit',
        action='store_true',
    )
    parser.add_argument(
        "-p",
        "--period",
        type=str,
        help=('Override the period specified in config file with this. '
              'Format is <period><h|d|w|y>. 1h is 1 hour, 2w is 2 weeks etc.'),
    )
    # Option deliberately hidden from the --help output
    parser.add_argument("--no-sqpoller", action='store_true',
                        help=argparse.SUPPRESS)

    userargs = parser.parse_args()

    cfg = load_sq_config(config_file=userargs.config)
    if not cfg:
        print(f'Invalid Suzieq config file {userargs.config}')
        sys.exit(1)

    logfile, loglevel = get_log_file_level('coalescer', cfg,
                                           '/tmp/sq-coalescer.log')
    logger = init_logger('suzieq.coalescer', logfile, loglevel, False)

    # Ensure we're the only coalescer: the PID file lives in the coalesce
    # directory, which defaults to <data-directory>/coalesced
    coalesce_dir = cfg.get('coalescer', {}).get(
        'coalesce-directory', f'{cfg.get("data-directory")}/coalesced')

    fd = ensure_single_instance(f'{coalesce_dir}/.sq-coalescer.pid', False)
    if not fd:
        print('ERROR: Another coalescer process present')
        logger.error('Another coalescer process present')
        sys.exit(errno.EBUSY)

    # Period precedence: a one-shot run passes an empty period, otherwise
    # the command line wins over the config file, which defaults to 1h
    if userargs.run_once:
        timestr = ''
    elif not userargs.period:
        timestr = cfg.get('coalescer', {}).get('period', '1h')
    else:
        timestr = userargs.period

    schemas = Schema(cfg.get('schema-directory'))
    if userargs.service_only or userargs.exclude_services:
        tables = [
            x for x in schemas.tables()
            if schemas.type_for_table(x) != "derivedRecord"
        ]
        if userargs.service_only:
            wanted = userargs.service_only.split()
            tables = [x for x in tables if x in wanted]
        if userargs.exclude_services:
            unwanted = userargs.exclude_services.split()
            tables = [x for x in tables if x not in unwanted]
        # NOTE(review): if the filters match nothing, tables ends up empty,
        # which is the same value used when no filter is given - confirm
        # how run_coalescer treats an empty list.
    else:
        tables = []

    try:
        run_coalescer(cfg, tables, timestr, userargs.run_once, logger,
                      userargs.no_sqpoller or False)
    finally:
        # Release the single-instance lock even when run_coalescer raises.
        # Cleanup is best effort: we are about to exit, so a failure to
        # truncate must not prevent the unlock/close, and neither should
        # mask the original exception or the exit status.
        try:
            os.truncate(fd, 0)
        except OSError:
            pass
        try:
            fcntl.flock(fd, fcntl.LOCK_UN)
            os.close(fd)
        except OSError:
            pass

    sys.exit(0)
def coalesce(self, tables: List[str] = [], period: str = '',
             ign_sqpoller: bool = False) -> None:
    """Coalesce all the resource parquet files in specified folder.

    This routine does not run periodically. It runs once and returns.

    :param tables: List[str], List of specific tables to coalesce, empty
                   for all. The list passed in is never modified; a
                   filtered copy is built instead.
    :param period: str, coalescing period in the form <number><h|d|w>,
                   e.g. 1h or 2w. When empty, the 'period' entry of the
                   'coalescer' config section is used, falling back to 1h.
    :param ign_sqpoller: True if its OK to ignore the absence of sqpoller
                         to coalesce
    :returns: coalesce statistics list, one entry per table processed, or
              None if the period is invalid or the sqPoller table is
              required but missing. (The ``-> None`` annotation is kept
              unchanged and understates this.)
    :rtype: List[SqCoalesceStats]
    """
    infolder = self.cfg['data-directory']
    outfolder = self._get_table_directory('', True)  # root folder
    archive_folder = self.cfg.get('coalescer', {}) \
        .get('archive-directory', f'{infolder}/_archived')

    if not period:
        # The section is named 'coalescer', as used for archive-directory
        # above; the former 'coalesceer' spelling never matched, so the
        # configured period was silently ignored.
        period = self.cfg.get('coalescer', {}).get('period', '1h')

    schemas = Schema(self.cfg.get('schema-directory'))
    state = SqCoalesceState(self.logger, period)

    state.logger = self.logger

    # Trying to be complete here. The ignore prefixes assume you have
    # coalescers running across multiple time periods, so each one must
    # ignore the files created by the longer-period coalescers. In other
    # words, the weekly coalescer ignores monthly and yearly coalesced
    # files, the monthly coalescer ignores yearly ones, and so on.
    try:
        timeint = int(period[:-1])
    except ValueError:
        self.logger.error(f'Invalid time, {period}')
        return

    time_unit = period[-1]
    if time_unit == 'h':
        run_int = timedelta(hours=timeint)
        state.prefix = 'sqc-h-'
        state.ign_pfx = ['.', '_', 'sqc-']
    elif time_unit == 'd':
        run_int = timedelta(days=timeint)
        if timeint > 364:
            state.prefix = 'sqc-y-'
            state.ign_pfx = ['.', '_', 'sqc-y-']
        elif timeint > 29:
            state.prefix = 'sqc-m-'
            state.ign_pfx = ['.', '_', 'sqc-m-', 'sqc-y-']
        else:
            state.prefix = 'sqc-d-'
            state.ign_pfx = [
                '.', '_', 'sqc-d-', 'sqc-w-', 'sqc-m-', 'sqc-y-'
            ]
    elif time_unit == 'w':
        run_int = timedelta(weeks=timeint)
        state.prefix = 'sqc-w-'
        state.ign_pfx = ['.', '_', 'sqc-w-', 'sqc-m-', 'sqc-y-']
    else:
        self.logger.error(f'Invalid unit for period, {time_unit}, '
                          'must be one of h/d/w')
        # Without this return, run_int below would be unbound
        return

    state.period = run_int

    # Create list of tables to coalesce.
    # TODO: Verify that we're only coalescing parquet tables here
    if tables:
        # NOTE(review): schemas.tables() is only tested for truthiness
        # here; a membership test (x in schemas.tables()) may have been
        # intended - confirm before changing.
        tables = [
            x for x in tables
            if schemas.tables() and (
                schemas.type_for_table(x) != "derivedRecord")
        ]
    else:
        tables = [
            x for x in schemas.tables()
            if schemas.type_for_table(x) != "derivedRecord"
        ]
    if 'sqPoller' not in tables and not ign_sqpoller:
        # This is an error. sqPoller keeps track of discontinuities
        # among other things.
        self.logger.error(
            'No sqPoller data, cannot compute discontinuities')
        return
    else:
        # We want sqPoller to be first to compute discontinuities
        with suppress(ValueError):
            tables.remove('sqPoller')
        if not ign_sqpoller:
            tables.insert(0, 'sqPoller')

    # We've forced the sqPoller to be always the first table to coalesce
    stats = []
    for entry in tables:
        table_outfolder = f'{outfolder}/{entry}'
        table_infolder = f'{infolder}//{entry}'
        if archive_folder:
            table_archive_folder = f'{archive_folder}/{entry}'
        else:
            table_archive_folder = None
        state.current_df = pd.DataFrame()
        state.dbeng = self

        state.schema = SchemaForTable(entry, schemas, None)
        if not os.path.isdir(table_infolder):
            self.logger.info(f'No input records to coalesce for {entry}')
            continue

        # Set before the try so the failure path below always has a valid
        # reference time, whichever step raised.
        start = time()
        try:
            os.makedirs(table_outfolder, exist_ok=True)
            if table_archive_folder:
                os.makedirs(table_archive_folder, exist_ok=True)
            # Migrate the data if needed
            self.logger.debug(f'Migrating data for {entry}')
            self.migrate(entry, state.schema)
            self.logger.debug(f'Migrated data for {entry}')
            # On success the reported duration covers the coalescing only
            start = time()
            coalesce_resource_table(table_infolder, table_outfolder,
                                    table_archive_folder, entry, state)
            end = time()
            self.logger.info(
                f'coalesced {state.wrfile_count} files/{state.wrrec_count} '
                f'records of {entry}')
            stats.append(
                SqCoalesceStats(
                    entry, period, int(end - start), state.wrfile_count,
                    state.wrrec_count,
                    int(datetime.now(tz=timezone.utc).timestamp() * 1000)))
        except Exception:
            # Keep going with the remaining tables; record a zero-count
            # entry for this one with the time spent until the failure.
            self.logger.exception(f'Unable to coalesce table {entry}')
            stats.append(
                SqCoalesceStats(
                    entry, period, int(time() - start), 0, 0,
                    int(datetime.now(tz=timezone.utc).timestamp() * 1000)))

    return stats
False) if not fd: print(f'ERROR: Another coalescer process present') logger.error(f'Another coalescer process present') sys.exit(errno.EBUSY) if userargs.run_once: timestr = '' elif not userargs.period: timestr = cfg.get('coalescer', {'period': '1h'}).get('period', '1h') else: timestr = userargs.period schemas = Schema(cfg.get('schema-directory')) if userargs.service_only or userargs.exclude_services: tables = [x for x in schemas.tables() if (schemas.type_for_table(x) != "derivedRecord")] if userargs.service_only: tables = [x for x in tables if x in userargs.service_only.split()] if userargs.exclude_services: tables = [x for x in tables if x not in userargs.exclude_services.split()] else: tables = [] run_coalescer(cfg, tables, timestr, userargs.run_once, logger, userargs.no_sqpoller or False) os.truncate(fd, 0) try: fcntl.flock(fd, fcntl.LOCK_UN) os.close(fd)