Example #1
    def on_connected(self, *args, **kwargs):
        if self._args.config:
            self.cfg = load_sq_config(validate=True,
                                      config_file=self._args.config)
        else:
            self.cfg = load_sq_config(validate=True)
        if not self.cfg:
            sys.exit(1)
        self.schemas = Schema(self.cfg["schema-directory"])
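The snippet above, minus the CLI plumbing, reduces to: load the config, bail out if it is missing, and hand the schema directory to Schema. A minimal standalone sketch, assuming the helpers live in suzieq.utils as in the releases these examples come from:

import sys

from suzieq.utils import Schema, load_sq_config

cfg = load_sq_config(validate=True)  # no config_file: use the default path
if not cfg:
    sys.exit(1)
schemas = Schema(cfg["schema-directory"])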
Example #2
def run_coalescer(cfg: dict,
                  tables: List[str],
                  period: str,
                  run_once: bool,
                  logger: Logger,
                  no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs it once and returns or periodically depending on the
    value of run_once. It also writes out the coalescer records
    as a parquet file.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param period: str, how often the poller runs, specified as a string
                   such as '1h' or '1d'
    :param run_once: bool, True if you want the poller to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller
                        record
    :returns: Nothing
    :rtype: None

    """

    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)
    if not run_once:
        now = datetime.now()
        nextrun = parse(period, settings={'PREFER_DATES_FROM': 'future'})
        sleep_time = (nextrun - now).seconds
        logger.info(f'Got sleep time of {sleep_time} secs')

    while True:
        try:
            stats = do_coalesce(cfg, tables, period, logger, no_sqpoller)
        except Exception:
            logger.exception('Coalescer aborted. Continuing')
            stats = []
        # Write the self-stats
        df = pd.DataFrame([asdict(x) for x in stats])
        if not df.empty:
            df['sqvers'] = coalescer_schema.version
            df['version'] = SUZIEQ_VERSION
            df['active'] = True
            df['namespace'] = ''
            pqdb.write('sqCoalescer', 'pandas', df, True,
                       coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep(sleep_time)
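A hedged sketch of driving run_coalescer for a single pass, mirroring how the CLI in Example #11 below invokes it (the period string is left empty when run_once is True); the table name is hypothetical:

import logging

cfg = load_sq_config(validate=True)
logger = logging.getLogger('suzieq.coalescer')
# One pass over a single (hypothetical) table, then return
run_coalescer(cfg, ['bgp'], '', True, logger)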
Example #3
    def __init__(self, engine, config_file=None):
        self.cfg = load_sq_config(config_file=config_file)

        self.schemas = Schema(self.cfg['schema-directory'])

        self.namespace = ''
        self.hostname = ''
        self.start_time = ''
        self.end_time = ''
        self.exec_time = ''
        self.engine = engine
        self.sort_fields = []
    def __init__(self, engine="pandas"):
        self.cfg = load_sq_config(validate=False)

        self.schemas = Schema(self.cfg["schema-directory"])

        self.namespace = ""
        self.hostname = ""
        self.start_time = ""
        self.end_time = ""
        self.exec_time = ""
        self.engine_name = engine
        self.sort_fields = []
        self.engine = get_sqengine(engine)
        super().__init__()
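Whichever constructor runs, the object ends up holding the parsed config and a Schema instance. A short sketch of what that enables; the table name 'device' is hypothetical:

cfg = load_sq_config(validate=False)
schemas = Schema(cfg["schema-directory"])

print(schemas.tables())                    # all tables the schemas know about
print(schemas.fields_for_table('device'))  # column names for one table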
Example #5
    def __init__(self, engine):
        self.cfg = load_sq_config(validate=False)

        self.schemas = Schema(self.cfg['schema-directory'])

        self.namespace = ''
        self.hostname = ''
        self.start_time = ''
        self.end_time = ''
        self.exec_time = ''
        self.engine = 'pandas'
        self.sort_fields = []
        self.engine = get_sqengine(self.engine)
        if not self.engine:
            # We really should define our own error
            raise ValueError
Example #6
    def on_connected(self, *args, **kwargs):
        if self._args.config:
            self.cfg = load_sq_config(validate=False,
                                      config_file=self._args.config)
            self.schemas = Schema(self.cfg["schema-directory"])
Example #7
async def init_services(svc_dir: str, schema_dir: str, queue, svclist: list,
                        def_interval: int, run_once: str):
    """Process service definitions by reading each file in svc dir"""

    svcs_list = []
    schemas = defaultdict(dict)

    # Load up all the service definitions we can find
    svc_classes = {}
    for i in walk_packages(path=[dirname(getfile(Service))]):
        for mbr in getmembers(
                importlib.import_module('suzieq.poller.services.' + i.name),
                isclass):
            if mbr[0] == "Service" or not mbr[0].endswith("Service"):
                continue
            svc_classes[i.name] = mbr[1]
            svc_classes[mbr[0]] = mbr[1]

    if not isdir(svc_dir):
        logger.error("services directory not a directory: {}".format(svc_dir))
        return svcs_list

    if not isdir(schema_dir):
        logger.error("schema directory not a directory: {}".format(svc_dir))
        return svcs_list
    else:
        schemas = Schema(schema_dir)

    if schemas:
        poller_schema = schemas.get_arrow_schema("sqPoller")
        poller_schema_version = SchemaForTable('sqPoller', schemas).version

    for root, _, filenames in walk(svc_dir):
        for filename in filenames:
            if filename.endswith(".yml"):
                with open(root + "/" + filename, "r") as f:
                    svc_def = yaml.safe_load(f.read())
                if svc_def.get('service') not in svclist:
                    logger.warning(
                        f'Ignoring unspecified service {svc_def.get("service")}'
                    )
                    continue

                if "service" not in svc_def or "apply" not in svc_def:
                    logger.error('Ignoring invalid service file definition. '
                                 'Need both "service" and "apply" '
                                 'keywords: {}'.format(filename))
                    continue

                period = svc_def.get("period", def_interval)
                for elem, val in svc_def["apply"].items():
                    if "copy" in val:
                        newval = svc_def["apply"].get(val["copy"], None)
                        if not newval:
                            logger.error("No device type {} to copy from for "
                                         "{} for service {}".format(
                                             val["copy"], elem,
                                             svc_def["service"]))
                            continue
                        val = newval

                    if (("command" not in val) or
                        ((isinstance(val['command'], list)
                          and not all('textfsm' in x or 'normalize' in x
                                      for x in val['command'])) or
                         (not isinstance(val['command'], list) and
                          ("normalize" not in val and "textfsm" not in val)))):
                        logger.error(
                            "Ignoring invalid service file "
                            'definition. Need both "command" and '
                            '"normalize/textfsm" keywords: {}, {}'.format(
                                filename, val))
                        continue

                    if "textfsm" in val:
                        # We may have already visited this element and parsed
                        # the textfsm file. Check for this
                        if val["textfsm"] and isinstance(
                                val["textfsm"], textfsm.TextFSM):
                            continue
                        tfsm_file = svc_dir + "/" + val["textfsm"]
                        if not isfile(tfsm_file):
                            logger.error("Textfsm file {} not found. Ignoring"
                                         " service".format(tfsm_file))
                            continue
                        with open(tfsm_file, "r") as f:
                            tfsm_template = textfsm.TextFSM(f)
                            val["textfsm"] = tfsm_template
                    elif isinstance(val['command'], list):
                        for subelem in val['command']:
                            if 'textfsm' in subelem:
                                if subelem["textfsm"] and isinstance(
                                        subelem["textfsm"], textfsm.TextFSM):
                                    continue
                                tfsm_file = svc_dir + "/" + subelem["textfsm"]
                                if not isfile(tfsm_file):
                                    logger.error(
                                        "Textfsm file {} not found. Ignoring"
                                        " service".format(tfsm_file))
                                    continue
                                with open(tfsm_file, "r") as f:
                                    tfsm_template = textfsm.TextFSM(f)
                                    subelem["textfsm"] = tfsm_template
                    else:
                        tfsm_template = None

                try:
                    schema = SchemaForTable(svc_def['service'], schema=schemas)
                except Exception:
                    logger.error(
                        f"No matching schema for {svc_def['service']}")
                    continue

                if schema.type == "derivedRecord":
                    # These are not real services and so ignore them
                    continue

                # Valid service definition, add it to list
                if svc_def["service"] in svc_classes:
                    service = svc_classes[svc_def["service"]](
                        svc_def["service"],
                        svc_def["apply"],
                        period,
                        svc_def.get("type", "state"),
                        svc_def.get("keys", []),
                        svc_def.get("ignore-fields", []),
                        schema,
                        queue,
                        run_once,
                    )
                else:
                    service = Service(svc_def["service"], svc_def["apply"],
                                      period, svc_def.get("type", "state"),
                                      svc_def.get("keys", []),
                                      svc_def.get("ignore-fields",
                                                  []), schema, queue, run_once)

                service.poller_schema = poller_schema
                service.poller_schema_version = poller_schema_version
                logger.info("Service {} added".format(service.name))
                svcs_list.append(service)

    return svcs_list
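For reference, a hypothetical minimal service definition that would pass the checks in init_services(): the required 'service' and 'apply' keys are present, and the per-device entry pairs 'command' with 'normalize'. All field values here are made up:

import yaml

svc_def = yaml.safe_load("""
service: device
apply:
  eos:
    command: show version
    normalize: 'modelName: model, version: version'
""")
assert "service" in svc_def and "apply" in svc_def
for elem, val in svc_def["apply"].items():
    assert "command" in val
    assert "normalize" in val or "textfsm" in val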
Example #8
def create_context():
    config = load_sq_config(config_file=create_dummy_config_file())
    context = NubiaSuzieqContext()
    context.cfg = config
    context.schemas = Schema(config["schema-directory"])
    return context
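In tests, the context built this way is usually probed through its schemas attribute; a brief usage sketch, reusing the helper above:

context = create_context()
# Every non-derived table known to the schema directory
tables = [t for t in context.schemas.tables()
          if context.schemas.type_for_table(t) != "derivedRecord"]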
Example #9
def test_transform(input_file):
    to_transform = Yaml2Class(input_file)

    try:
        data_directory = to_transform.transform.data_directory
    except AttributeError:
        print('Invalid transformation file, no data directory')
        pytest.fail('AttributeError', pytrace=True)

    #  Make a copy of the data directory
    temp_dir, tmpfile = _coalescer_init(data_directory)

    cfg = load_sq_config(config_file=tmpfile.name)
    schemas = Schema(cfg['schema-directory'])

    for ele in to_transform.transform.transform:
        query_str_list = []
        # Each transformation has a record => writes happen per record
        for record in ele.record:
            changed_fields = set()
            new_df = pd.DataFrame()
            tables = [x for x in dir(record) if not x.startswith('_')]
            for table in tables:
                # Let's read the data in now that we know the table
                tblobj = get_sqobject(table)
                pq_db = get_sqdb_engine(cfg, table, None, None)
                columns = schemas.fields_for_table(table)
                mod_df = tblobj(config_file=tmpfile.name).get(columns=columns)

                for key in getattr(record, table):
                    query_str = key.match
                    chg_df = pd.DataFrame()
                    if query_str != "all":
                        try:
                            chg_df = mod_df.query(query_str) \
                                           .reset_index(drop=True)
                        except Exception as ex:
                            assert (not ex)
                        query_str_list.append(query_str)
                    else:
                        chg_df = mod_df

                    _process_transform_set(key.set, chg_df, changed_fields)
                    if new_df.empty:
                        new_df = chg_df
                    elif not chg_df.empty:
                        new_df = pd.concat([new_df, chg_df])

                if new_df.empty:
                    continue

                # Write the records now
                _write_verify_transform(new_df, table, pq_db,
                                        SchemaForTable(table,
                                                       schemas), tmpfile.name,
                                        query_str_list, changed_fields)

    # Now we coalesce and verify it works
    from suzieq.sqobjects.tables import TablesObj

    pre_table_df = TablesObj(config_file=tmpfile.name).get()
    do_coalesce(cfg, None)
    _verify_coalescing(temp_dir)

    post_table_df = TablesObj(config_file=tmpfile.name).get()
    assert_df_equal(pre_table_df, post_table_df, None)

    # Run additional tests on the coalesced data
    for ele in to_transform.transform.verify:
        table = [x for x in dir(ele) if not x.startswith('_')][0]
        tblobj = get_sqobject(table)

        for tst in getattr(ele, table):
            start_time = tst.test.get('start-time', '')
            end_time = tst.test.get('end-time', '')

            columns = tst.test.get('columns', ['default'])
            df = tblobj(config_file=tmpfile.name,
                        start_time=start_time,
                        end_time=end_time).get(columns=columns)
            if not df.empty and 'query' in tst.test:
                query_str = tst.test['query']
                df = df.query(query_str).reset_index(drop=True)

            if 'assertempty' in tst.test:
                assert (df.empty)
            elif 'shape' in tst.test:
                shape = tst.test['shape'].split()
                if shape[0] != '*':
                    assert (int(shape[0]) == df.shape[0])
                if shape[1] != '*':
                    assert (int(shape[1]) == df.shape[1])
            else:
                assert (not df.empty)

    _coalescer_cleanup(temp_dir, tmpfile)
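The 'shape' assertion above treats '*' as a per-dimension wildcard; a self-contained illustration of just that check:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
shape = '* 2'.split()  # e.g. taken from tst.test['shape']
if shape[0] != '*':
    assert int(shape[0]) == df.shape[0]
if shape[1] != '*':
    assert int(shape[1]) == df.shape[1]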
Example #10
    def coalesce(self,
                 tables: List[str] = [],
                 period: str = '',
                 ign_sqpoller: bool = False) -> None:
        """Coalesce all the resource parquet files in specified folder.

        This routine does not run periodically. It runs once and returns.

        :param tables: List[str], list of specific tables to coalesce,
                       empty for all
        :param period: str, coalescing period, needed for various internal
                       computations
        :param ign_sqpoller: True if it's OK to ignore the absence of sqpoller
                             records when coalescing
        :returns: coalesce statistics list, one per table
        :rtype: SqCoalesceStats
        """

        infolder = self.cfg['data-directory']
        outfolder = self._get_table_directory('', True)  # root folder
        archive_folder = self.cfg.get('coalescer', {}) \
                                 .get('archive-directory',
                                      f'{infolder}/_archived')

        if not period:
            period = self.cfg.get('coalescer', {
                'period': '1h'
            }).get('period', '1h')
        schemas = Schema(self.cfg.get('schema-directory'))
        state = SqCoalesceState(self.logger, period)

        state.logger = self.logger
        # Trying to be complete here. The ignore prefixes assume you have
        # coalescers across multiple time periods running, and so we need to
        # ignore the files created by the longer-period coalescing runs. In
        # other words, the weekly coalescer should ignore monthly and yearly
        # coalesced files, the monthly coalescer should ignore yearly ones,
        # and so on.
        try:
            timeint = int(period[:-1])
            time_unit = period[-1]
            if time_unit == 'h':
                run_int = timedelta(hours=timeint)
                state.prefix = 'sqc-h-'
                state.ign_pfx = ['.', '_', 'sqc-']
            elif time_unit == 'd':
                run_int = timedelta(days=timeint)
                if timeint > 364:
                    state.prefix = 'sqc-y-'
                    state.ign_pfx = ['.', '_', 'sqc-y-']
                elif timeint > 29:
                    state.prefix = 'sqc-m-'
                    state.ign_pfx = ['.', '_', 'sqc-m-', 'sqc-y-']
                else:
                    state.prefix = 'sqc-d-'
                    state.ign_pfx = [
                        '.', '_', 'sqc-d-', 'sqc-w-', 'sqc-m-', 'sqc-y-'
                    ]
            elif time_unit == 'w':
                run_int = timedelta(weeks=timeint)
                state.prefix = 'sqc-w-'
                state.ign_pfx = ['.', '_', 'sqc-w-', 'sqc-m-', 'sqc-y-']
            else:
                logging.error(f'Invalid unit for period, {time_unit}, '
                              'must be one of h/d/w')
                return
        except ValueError:
            logging.error(f'Invalid time, {period}')
            return

        state.period = run_int
        # Create list of tables to coalesce.
        # TODO: Verify that we're only coalescing parquet tables here
        if tables:
            tables = [
                x for x in tables if x in schemas.tables() and (
                    schemas.type_for_table(x) != "derivedRecord")
            ]
        else:
            tables = [
                x for x in schemas.tables()
                if schemas.type_for_table(x) != "derivedRecord"
            ]
        if 'sqPoller' not in tables and not ign_sqpoller:
            # This is an error. sqPoller keeps track of discontinuities
            # among other things.
            self.logger.error(
                'No sqPoller data, cannot compute discontinuities')
            return
        else:
            # We want sqPoller to be first to compute discontinuities
            with suppress(ValueError):
                tables.remove('sqPoller')
            if not ign_sqpoller:
                tables.insert(0, 'sqPoller')

        # We've forced the sqPoller to be always the first table to coalesce
        stats = []
        for entry in tables:
            table_outfolder = f'{outfolder}/{entry}'
            table_infolder = f'{infolder}/{entry}'
            if archive_folder:
                table_archive_folder = f'{archive_folder}/{entry}'
            else:
                table_archive_folder = None
            state.current_df = pd.DataFrame()
            state.dbeng = self
            state.schema = SchemaForTable(entry, schemas, None)
            if not os.path.isdir(table_infolder):
                self.logger.info(f'No input records to coalesce for {entry}')
                continue
            try:
                if not os.path.isdir(table_outfolder):
                    os.makedirs(table_outfolder)
                if (table_archive_folder
                        and not os.path.isdir(table_archive_folder)):
                    os.makedirs(table_archive_folder, exist_ok=True)
                # Migrate the data if needed
                self.logger.debug(f'Migrating data for {entry}')
                self.migrate(entry, state.schema)
                self.logger.debug(f'Done migrating data for {entry}')
                start = time()
                coalesce_resource_table(table_infolder, table_outfolder,
                                        table_archive_folder, entry, state)
                end = time()
                self.logger.info(
                    f'coalesced {state.wrfile_count} files/{state.wrrec_count} '
                    f'records of {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(end - start), state.wrfile_count,
                        state.wrrec_count,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))
            except Exception:
                self.logger.exception(f'Unable to coalesce table {entry}')
                # start/end may be unset if the failure occurred before
                # coalescing began, so record a zero duration
                stats.append(
                    SqCoalesceStats(
                        entry, period, 0, 0, 0,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))

        return stats
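The period-to-prefix mapping above decides which earlier coalescer outputs a run must skip. A standalone sketch of just that mapping, assuming the same '<int><h|d|w>' period format:

def coalesce_prefix(period: str) -> str:
    """Map a period string to the filename prefix used for its output."""
    timeint = int(period[:-1])
    unit = period[-1]
    if unit == 'h':
        return 'sqc-h-'
    if unit == 'd':
        if timeint > 364:
            return 'sqc-y-'
        if timeint > 29:
            return 'sqc-m-'
        return 'sqc-d-'
    if unit == 'w':
        return 'sqc-w-'
    raise ValueError(f'Invalid unit for period, {unit}, must be one of h/d/w')

assert coalesce_prefix('1h') == 'sqc-h-'
assert coalesce_prefix('30d') == 'sqc-m-'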
Example #11
def coalescer_main():

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--service-only",
        type=str,
        help="Only run this space separated list of services",
    )
    parser.add_argument(
        "-x",
        "--exclude-services",
        type=str,
        help="Exclude running this space separated list of services",
    )

    parser.add_argument("-c",
                        "--config",
                        default=f'{os.getenv("HOME")}/.suzieq/suzieq-cfg.yml',
                        type=str,
                        help="alternate config file")
    parser.add_argument(
        "--run-once",
        default=False,
        help='Run the coalescer once and exit',
        action='store_true',
    )
    parser.add_argument(
        "-p",
        "--period",
        type=str,
        help=('Override the period specified in config file with this. '
              'Format is <period><h|d|w|y>. 1h is 1 hour, 2w is 2 weeks etc.'))
    parser.add_argument("--no-sqpoller",
                        action='store_true',
                        help=argparse.SUPPRESS)

    userargs = parser.parse_args()

    cfg = load_sq_config(config_file=userargs.config)
    if not cfg:
        print(f'Invalid Suzieq config file {userargs.config}')
        sys.exit(1)

    logfile, loglevel = get_log_file_level('coalescer', cfg,
                                           '/tmp/sq-coalescer.log')
    logger = init_logger('suzieq.coalescer', logfile, loglevel, False)

    # Ensure we're the only compacter
    coalesce_dir = cfg.get('coalescer', {})\
                      .get('coalesce-directory',
                           f'{cfg.get("data-directory")}/coalesced')

    fd = ensure_single_instance(f'{coalesce_dir}/.sq-coalescer.pid', False)
    if not fd:
        print('ERROR: Another coalescer process present')
        logger.error('Another coalescer process present')
        sys.exit(errno.EBUSY)

    if userargs.run_once:
        timestr = ''
    elif not userargs.period:
        timestr = cfg.get('coalescer', {'period': '1h'}).get('period', '1h')
    else:
        timestr = userargs.period

    schemas = Schema(cfg.get('schema-directory'))
    if userargs.service_only or userargs.exclude_services:
        tables = [
            x for x in schemas.tables()
            if (schemas.type_for_table(x) != "derivedRecord")
        ]
        if userargs.service_only:
            tables = [x for x in tables if x in userargs.service_only.split()]
        if userargs.exclude_services:
            tables = [
                x for x in tables
                if x not in userargs.exclude_services.split()
            ]
    else:
        tables = []

    run_coalescer(cfg, tables, timestr, userargs.run_once, logger,
                  userargs.no_sqpoller or False)
    os.truncate(fd, 0)
    try:
        fcntl.flock(fd, fcntl.LOCK_UN)
        os.close(fd)
    except OSError:
        pass

    sys.exit(0)
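The interplay of --service-only and --exclude-services reduces to two list filters over the schema's table names; a tiny illustration with hypothetical tables:

tables = ['device', 'interfaces', 'bgp']  # hypothetical table names
service_only = 'device bgp'               # as passed via -s
exclude_services = 'bgp'                  # as passed via -x

tables = [x for x in tables if x in service_only.split()]
tables = [x for x in tables if x not in exclude_services.split()]
assert tables == ['device']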
Example #13
    if 'notifcnReason' in df.columns:
        df.rename(columns={'notifcnReason': 'notificnReason'}, inplace=True)

    pq.write_to_dataset(
        table,
        root_path=output_dir,
        partition_cols=partition_cols,
        version="2.0",
        compression='ZSTD',
        row_group_size=100000,
    )

    logger.info(f'Wrote converted {input_dir}')


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Usage: convert_parquet <input_dir> <output_dir> <schema_dir>')
        sys.exit(1)

    input_dir = Path(sys.argv[1])
    output_dir = sys.argv[2]
    schemas = Schema(sys.argv[3])
    service = input_dir.parts[-1]
    svc_schema = SchemaForTable(service, schema=schemas)

    logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
    logger = logging.getLogger('sq-converter')
    convert_dir(input_dir, output_dir, svc_schema)
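A hypothetical invocation of this converter, assuming the input directory's last path component names the service whose schema is looked up:

from pathlib import Path

# python convert_parquet.py ./parquet/bgp ./parquet-v2 ./config/schema
service = Path('./parquet/bgp').parts[-1]
assert service == 'bgp'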