Example #1
    def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        # validate_get_input raises an exception on invalid input
        try:
            self.validate_get_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        table_schema = SchemaForTable(self._table, self.all_schemas)
        columns = table_schema.get_display_fields(columns)

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine.top(what=what, n=n, reverse=reverse, **kwargs)
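
The engine-level top() called on the last line is not shown here; a minimal sketch of what it presumably reduces to, using pandas nlargest/nsmallest (the function and variable names are illustrative, not Suzieq's):

import pandas as pd

def top_sketch(df: pd.DataFrame, what: str, n: int = 5,
               reverse: bool = False) -> pd.DataFrame:
    """Return the n largest (or smallest, if reverse) rows by 'what'."""
    if reverse:
        return df.nsmallest(n, columns=what, keep='all')
    return df.nlargest(n, columns=what, keep='all')

# e.g. top_sketch(intf_df, 'numChanges', n=5)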
Example #2
    def __init__(self, engine_name: str = 'pandas',
                 hostname: typing.List[str] = [],
                 start_time: str = '', end_time: str = '',
                 view: str = 'latest', namespace: typing.List[str] = [],
                 columns: typing.List[str] = ['default'],
                 context=None, table: str = '', config_file=None) -> None:

        if context is None:
            self.ctxt = SqContext(engine_name, config_file)
        else:
            self.ctxt = context
            if not self.ctxt:
                self.ctxt = SqContext(engine_name)

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()

        if not namespace and self.ctxt.namespace:
            self.namespace = self.ctxt.namespace
        else:
            self.namespace = namespace
        if not hostname and self.ctxt.hostname:
            self.hostname = self.ctxt.hostname
        else:
            self.hostname = hostname

        if not start_time and self.ctxt.start_time:
            self.start_time = self.ctxt.start_time
        else:
            self.start_time = start_time

        if not end_time and self.ctxt.end_time:
            self.end_time = self.ctxt.end_time
        else:
            self.end_time = end_time

        if not view and self.ctxt.view:
            self.view = self.ctxt.view
        else:
            self.view = view
        self.columns = columns

        self.engine = None
        if engine_name:
            self.engine = get_sqengine(engine_name,
                                       self._table)(self._table, self)
        elif self.ctxt.engine:
            self.engine = get_sqengine(self.ctxt.engine,
                                       self._table)(self._table, self)

        if not self.engine:
            raise ValueError('Unknown analysis engine')

        self._addnl_filter = None
        self._addnl_fields = []
        self._valid_get_args = None
        self._valid_assert_args = None
        self._valid_arg_vals = None
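
One caveat worth noting about this constructor: list defaults such as hostname=[] are created once at definition time and shared across all calls in Python. A minimal sketch of the safer idiom (the class name is made up for illustration):

import typing

class SqObjSketch:
    def __init__(self,
                 hostname: typing.Optional[typing.List[str]] = None,
                 columns: typing.Optional[typing.List[str]] = None) -> None:
        # A fresh list per instance; a [] default in the signature
        # would be shared by every instantiation
        self.hostname = hostname if hostname is not None else []
        self.columns = columns if columns is not None else ['default']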
Example #3
def run_coalescer(cfg: dict,
                  tables: List[str],
                  period: str,
                  run_once: bool,
                  logger: Logger,
                  no_sqpoller: bool = False) -> None:
    """Run the coalescer.

    Runs it once and returns or periodically depending on the
    value of run_once. It also writes out the coalescer records
    as a parquet file.

    :param cfg: dict, the Suzieq config file read in
    :param tables: List[str], list of table names to coalesce
    :param period: str, the string of how periodically the poller runs,
                   Examples are '1h', '1d' etc.
    :param run_once: bool, True if you want the poller to run just once
    :param logger: logging.Logger, the logger to write logs to
    :param no_sqpoller: bool, write records even when there's no sqpoller rec
    :returns: Nothing
    :rtype: None

    """

    try:
        schemas = Schema(cfg['schema-directory'])
    except Exception as ex:
        logger.error(f'Aborting. Unable to load schema: {str(ex)}')
        print(f'ERROR: Aborting. Unable to load schema: {str(ex)}')
        sys.exit(1)

    coalescer_schema = SchemaForTable('sqCoalescer', schemas)
    pqdb = get_sqdb_engine(cfg, 'sqCoalescer', None, logger)
    if not run_once:
        now = datetime.now()
        nextrun = parse(period, settings={'PREFER_DATES_FROM': 'future'})
        sleep_time = (nextrun - now).seconds
        logger.info(f'Got sleep time of {sleep_time} secs')

    while True:
        stats = []
        try:
            stats = do_coalesce(cfg, tables, period, logger, no_sqpoller)
        except Exception:
            logger.exception('Coalescer aborted. Continuing')
        # Write the self stats
        df = pd.DataFrame([asdict(x) for x in stats])
        if not df.empty:
            df['sqvers'] = coalescer_schema.version
            df['version'] = SUZIEQ_VERSION
            df['active'] = True
            df['namespace'] = ''
            pqdb.write('sqCoalescer', 'pandas', df, True,
                       coalescer_schema.get_arrow_schema(), None)

        if run_once:
            break
        sleep(sleep_time)
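
The sleep computation above relies on dateparser interpreting the period string relative to now; a standalone sketch of the same idea (note that total_seconds() also counts whole days, unlike the .seconds attribute used above):

from datetime import datetime

import dateparser

def seconds_until_next_run(period: str) -> float:
    """Seconds to sleep so the next run lands one period from now."""
    nextrun = dateparser.parse(period,
                               settings={'PREFER_DATES_FROM': 'future'})
    return (nextrun - datetime.now()).total_seconds()

# e.g. sleep(seconds_until_next_run('1h'))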
Example #4
def convert_dir(input_dir: str, output_dir: str, svcschema: SchemaForTable):
    """Convert the data into a single file and write it out"""

    defaults = {
        pa.string(): "",
        pa.int32(): 0,
        pa.int64(): 0,
        pa.float32(): 0.0,
        pa.float64(): 0.0,
        pa.date64(): 0.0,
        pa.bool_(): False,
        pa.list_(pa.string()): ['-'],
        pa.list_(pa.int64()): [],
    }

    df = pd.read_parquet(input_dir, use_legacy_dataset=True)
    sqschema = svcschema.get_raw_schema()
    arrow_schema = svcschema.get_arrow_schema()

    for column in filter(lambda x: x['name'] not in df.columns, sqschema):
        df[column['name']] = column.get('default', defaults[column['type']])

    # convert all dtypes to whatever is desired
    for column in df.columns:
        if column in arrow_schema:
            df[column] = df[column].astype(
                arrow_schema.field(column).type.to_pandas_dtype())

    # If there's the original ifname saved up, then eliminate this unnecessary
    # field as this model is no longer necessary

    if 'origIfname' in df.columns:
        if 'ifname' in df.columns:
            df = df.drop(columns=['ifname']) \
                   .rename(columns={'origIfname': 'ifname'})
        elif 'oif' in df.columns:
            df = df.drop(columns=['oif']) \
                   .rename(columns={'origIfname': 'oif'})

    if 'notifcnReason' in df.columns:
        df = df.rename(columns={'notifcnReason': 'notificnReason'})

    table = pa.Table.from_pandas(df, schema=arrow_schema, preserve_index=False)
    partition_cols = svcschema.get_partition_columns()

    pq.write_to_dataset(
        table,
        root_path=output_dir,
        partition_cols=partition_cols,
        version="2.0",
        compression='ZSTD',
        row_group_size=100000,
    )

    logger.info(f'Wrote converted {input_dir}')
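
A self-contained sketch of the defaults-by-arrow-type fill at the top of convert_dir; the schema and data below are invented for illustration:

import pandas as pd
import pyarrow as pa

defaults = {pa.string(): "", pa.int64(): 0, pa.bool_(): False}

sqschema = [{'name': 'hostname', 'type': pa.string()},
            {'name': 'numChanges', 'type': pa.int64(), 'default': -1},
            {'name': 'active', 'type': pa.bool_()}]

df = pd.DataFrame({'hostname': ['leaf01']})
for column in filter(lambda x: x['name'] not in df.columns, sqschema):
    # a per-field default wins; otherwise fall back to the type default
    df[column['name']] = column.get('default', defaults[column['type']])
print(df)  # numChanges filled with -1, active with False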
Example #5
    def get_table_info(self, table, **kwargs):
        sch = SchemaForTable(table, schema=self.schemas)
        # You can't use view from user because we need to see all the data
        # to compute data required.
        kwargs.pop('view', None)
        all_time_df = self._get_table_info(table, view='all', **kwargs)
        times = all_time_df['timestamp'].unique()
        ret = {'first_time': all_time_df.timestamp.min(),
               'latest_time': all_time_df.timestamp.max(),
               'intervals': len(times),
               'all rows': len(all_time_df),
               'namespaces': self._unique_or_zero(all_time_df, 'namespace'),
               'devices': self._unique_or_zero(all_time_df, 'hostname')}

        return ret
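
_unique_or_zero is not shown in this example; a plausible reconstruction follows (the helper body is an assumption, the real one may differ):

import pandas as pd

def _unique_or_zero(df: pd.DataFrame, col: str) -> int:
    """Count distinct values of col, or 0 if the column is absent."""
    if col in df.columns:
        return df[col].nunique()
    return 0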
Example #6
    def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        table_schema = SchemaForTable(self._table, self.all_schemas)
        columns = table_schema.get_display_fields(columns)

        if what == "numChanges" and what not in columns:
            self._addnl_nbr_fields.append(what)

        return self.engine_obj.top(what=what, n=n, reverse=reverse, **kwargs)
Example #7
    def get(self, **kwargs):
        """Replacing the original interface name in returned result"""

        addnl_fields = kwargs.pop('addnl_fields', [])
        columns = kwargs.get('columns', ['default'])
        vrf = kwargs.pop('vrf', None)
        peer = kwargs.pop('peer', None)
        hostname = kwargs.pop('hostname', None)

        drop_cols = ['origPeer', 'peerHost']
        addnl_fields.extend(['origPeer'])
        sch = SchemaForTable(self.iobj.table, self.schemas)
        fields = sch.get_display_fields(columns)

        for col in ['peerIP', 'updateSource', 'state', 'namespace', 'vrf',
                    'peer', 'hostname']:
            if col not in fields:
                addnl_fields.append(col)
                drop_cols.append(col)

        df = super().get(addnl_fields=addnl_fields, **kwargs)

        if df.empty:
            return df

        query_str = build_query_str([], sch, vrf=vrf, peer=peer,
                                    hostname=hostname)
        if 'peer' in df.columns:
            df['peer'] = np.where(df['origPeer'] != "",
                                  df['origPeer'], df['peer'])
        if 'peerHostname' in df.columns:
            mdf = self._get_peer_matched_df(df)
            drop_cols = [x for x in drop_cols if x in mdf.columns]
            drop_cols.extend(list(mdf.filter(regex='_y')))
        else:
            mdf = df

        if query_str:
            return mdf.query(query_str).drop(columns=drop_cols,
                                             errors='ignore')
        else:
            return mdf.drop(columns=drop_cols, errors='ignore')
Example #8
    def describe(self, **kwargs):
        """Describes the fields for a given table"""

        table = kwargs.get('table', '')
        try:
            sch = SchemaForTable(table, self.schemas)
        except ValueError:
            sch = None
        if not sch:
            df = pd.DataFrame(
                {'error': [f'ERROR: incorrect table name {table}']})
            return df

        entries = [{
            'name': x['name'],
            'type': x['type'],
            'key': x.get('key', ''),
            'display': x.get('display', '')
        } for x in sch.get_raw_schema()]
        df = pd.DataFrame.from_dict(entries).sort_values('name')

        return df
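
For reference, the raw-schema-to-DataFrame step in isolation, with a made-up two-field schema:

import pandas as pd

raw_schema = [{'name': 'timestamp', 'type': 'timestamp'},
              {'name': 'hostname', 'type': 'string', 'key': 1}]

entries = [{'name': x['name'], 'type': x['type'],
            'key': x.get('key', ''), 'display': x.get('display', '')}
           for x in raw_schema]
print(pd.DataFrame(entries).sort_values('name'))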
Example #9
    def coalesce(self,
                 tables: List[str] = [],
                 period: str = '',
                 ign_sqpoller: bool = False) -> List[SqCoalesceStats]:
        """Coalesce all the resource parquet files in specified folder.

        This routine does not run periodically. It runs once and returns.

        :param tables: List[str], List of specific tables to coalesce, empty for all
        :param period: str, coalescing period, needed for various internal stuff
        :param ign_sqpoller: True if it's OK to ignore the absence of
                             sqpoller records when coalescing
        :returns: coalesce statistics list, one per table
        :rtype: List[SqCoalesceStats]
        """

        infolder = self.cfg['data-directory']
        outfolder = self._get_table_directory('', True)  # root folder
        archive_folder = self.cfg.get('coalescer', {}) \
                                 .get('archive-directory',
                                      f'{infolder}/_archived')

        if not period:
            period = self.cfg.get('coalescer', {
                'period': '1h'
            }).get('period', '1h')
        schemas = Schema(self.cfg.get('schema-directory'))
        state = SqCoalesceState(self.logger, period)

        state.logger = self.logger
        # Trying to be complete here. The ignore prefixes assume you have
        # coalescers running across multiple time periods, and so we need to
        # ignore the files created by the longer-period coalescers. In other
        # words, the weekly coalescer should ignore monthly and yearly
        # coalesced files, the monthly coalescer should ignore yearly files,
        # and so on.
        try:
            timeint = int(period[:-1])
            time_unit = period[-1]
            if time_unit == 'h':
                run_int = timedelta(hours=timeint)
                state.prefix = 'sqc-h-'
                state.ign_pfx = ['.', '_', 'sqc-']
            elif time_unit == 'd':
                run_int = timedelta(days=timeint)
                if timeint > 364:
                    state.prefix = 'sqc-y-'
                    state.ign_pfx = ['.', '_', 'sqc-y-']
                elif timeint > 29:
                    state.prefix = 'sqc-m-'
                    state.ign_pfx = ['.', '_', 'sqc-m-', 'sqc-y-']
                else:
                    state.prefix = 'sqc-d-'
                    state.ign_pfx = [
                        '.', '_', 'sqc-d-', 'sqc-w-', 'sqc-m-', 'sqc-y-'
                    ]
            elif time_unit == 'w':
                run_int = timedelta(weeks=timeint)
                state.prefix = 'sqc-w-'
                state.ign_pfx = ['.', '_', 'sqc-w-', 'sqc-m-', 'sqc-y-']
            else:
                logging.error(f'Invalid unit for period, {time_unit}, '
                              'must be one of h/d/w')
                return
        except ValueError:
            logging.error(f'Invalid time, {period}')
            return

        state.period = run_int
        # Create list of tables to coalesce.
        # TODO: Verify that we're only coalescing parquet tables here
        if tables:
            tables = [
                x for x in tables if x in schemas.tables() and (
                    schemas.type_for_table(x) != "derivedRecord")
            ]
        else:
            tables = [
                x for x in schemas.tables()
                if schemas.type_for_table(x) != "derivedRecord"
            ]
        if 'sqPoller' not in tables and not ign_sqpoller:
            # This is an error. sqPoller keeps track of discontinuities
            # among other things.
            self.logger.error(
                'No sqPoller data, cannot compute discontinuities')
            return
        else:
            # We want sqPoller to be first to compute discontinuities
            with suppress(ValueError):
                tables.remove('sqPoller')
            if not ign_sqpoller:
                tables.insert(0, 'sqPoller')

        # We've forced the sqPoller to be always the first table to coalesce
        stats = []
        for entry in tables:
            table_outfolder = f'{outfolder}/{entry}'
            table_infolder = f'{infolder}//{entry}'
            if archive_folder:
                table_archive_folder = f'{archive_folder}/{entry}'
            else:
                table_archive_folder = None
            state.current_df = pd.DataFrame()
            state.dbeng = self
            state.schema = SchemaForTable(entry, schemas, None)
            if not os.path.isdir(table_infolder):
                self.logger.info(f'No input records to coalesce for {entry}')
                continue
            start = end = time()
            try:
                if not os.path.isdir(table_outfolder):
                    os.makedirs(table_outfolder)
                if (table_archive_folder
                        and not os.path.isdir(table_archive_folder)):
                    os.makedirs(table_archive_folder, exist_ok=True)
                # Migrate the data if needed
                self.logger.debug(f'Migrating data for {entry}')
                self.migrate(entry, state.schema)
                self.logger.debug(f'Migrated data for {entry}')
                start = time()
                coalesce_resource_table(table_infolder, table_outfolder,
                                        table_archive_folder, entry, state)
                end = time()
                self.logger.info(
                    f'coalesced {state.wrfile_count} files/{state.wrrec_count} '
                    f'records of {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(end - start), state.wrfile_count,
                        state.wrrec_count,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))
            except Exception:
                self.logger.exception(f'Unable to coalesce table {entry}')
                stats.append(
                    SqCoalesceStats(
                        entry, period, int(end - start), 0, 0,
                        int(datetime.now(tz=timezone.utc).timestamp() * 1000)))

        return stats
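
The period handling in coalesce(), distilled into a standalone sketch (the function name is illustrative; the unit letters follow the code above):

from datetime import timedelta

def parse_period(period: str) -> timedelta:
    """Turn '1h'/'2d'/'1w' into a timedelta; raise on anything else."""
    timeint = int(period[:-1])
    unit = period[-1]
    if unit == 'h':
        return timedelta(hours=timeint)
    if unit == 'd':
        return timedelta(days=timeint)
    if unit == 'w':
        return timedelta(weeks=timeint)
    raise ValueError(f'Invalid unit for period, {unit}, '
                     'must be one of h/d/w')

# parse_period('1d') -> datetime.timedelta(days=1)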
Example #10
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields + addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        for fld in key_fields:
            if fld not in fields + addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        for dt in [self.iobj.start_time, self.iobj.end_time]:
            if dt:
                try:
                    parse(dt)
                except (ValueError, ParserError) as e:
                    print(f"invalid time {dt}: {e}")
                    return pd.DataFrame()

        table_df = self.ctxt.engine.get_table_df(
            self.cfg,
            table=phy_table,
            start_time=self.iobj.start_time,
            end_time=self.iobj.end_time,
            columns=fields,
            view=view,
            key_fields=key_fields,
            **kwargs)

        if not table_df.empty:
            if view == 'latest' and active_only:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            else:
                table_df.drop(columns=drop_cols, inplace=True)
            if 'timestamp' in table_df.columns:
                table_df['timestamp'] = pd.to_datetime(
                    table_df.timestamp.astype(str), unit="ms")

        return table_df
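
The closing timestamp conversion, shown standalone; the epoch-millisecond values are invented:

import pandas as pd

table_df = pd.DataFrame({'timestamp': [1609459200000, 1609462800000]})
# epoch milliseconds -> datetime64 values
table_df['timestamp'] = pd.to_datetime(table_df['timestamp'], unit='ms')
print(table_df)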
Example #11
    def _get_combined_df(self, **kwargs):
        """OSPF has info divided across multiple tables. Get a single one"""

        columns = kwargs.pop('columns', ['default'])
        state = kwargs.pop('state', '')
        addnl_fields = kwargs.pop('addnl_fields', self.iobj._addnl_fields)
        addnl_nbr_fields = self.iobj._addnl_nbr_fields

        cols = SchemaForTable('ospf', schema=self.schemas) \
            .get_display_fields(columns)
        if columns == ['default']:
            cols.append('timestamp')

        ifschema = SchemaForTable('ospfIf', schema=self.schemas)
        nbrschema = SchemaForTable('ospfNbr', schema=self.schemas)

        if (columns != ['default']) and (columns != ['*']):
            ifkeys = ifschema.key_fields()
            nbrkeys = nbrschema.key_fields()
            if_flds = ifschema.fields
            nbr_flds = nbrschema.fields

            ifcols = ifkeys
            nbrcols = nbrkeys
            for fld in columns:
                if fld in if_flds and fld not in ifcols:
                    ifcols.append(fld)
                elif fld in nbr_flds and fld not in nbrcols:
                    nbrcols.append(fld)
        else:
            ifcols = ifschema.get_display_fields(columns)
            nbrcols = nbrschema.get_display_fields(columns)

        if state == "full":
            query_str = 'adjState == "full" or adjState == "passive"'
        elif state == "other":
            query_str = 'adjState != "full" and adjState != "passive"'
        elif state == "passive":
            query_str = 'adjState == "passive"'
        else:
            query_str = ''

        df = self.get_valid_df('ospfIf',
                               addnl_fields=addnl_fields,
                               columns=ifcols,
                               **kwargs)
        nbr_df = self.get_valid_df('ospfNbr',
                                   addnl_fields=addnl_nbr_fields,
                                   columns=nbrcols,
                                   **kwargs)
        if nbr_df.empty:
            return df

        merge_cols = [
            x for x in ['namespace', 'hostname', 'ifname']
            if x in nbr_df.columns
        ]
        # Merge the two tables
        df = df.merge(nbr_df, on=merge_cols, how='left')

        if columns == ['*']:
            df = df.drop(columns=['area_y', 'instance_y', 'vrf_y',
                                  'areaStub_y', 'timestamp_y']) \
                .rename(columns={
                    'instance_x': 'instance', 'areaStub_x': 'areaStub',
                    'area_x': 'area', 'vrf_x': 'vrf',
                    'state_x': 'ifState', 'state_y': 'adjState',
                    'sqvers_x': 'sqvers', 'active_x': 'active',
                    'timestamp_x': 'timestamp'})
        else:
            df = df.rename(
                columns={
                    'vrf_x': 'vrf',
                    'area_x': 'area',
                    'state_x': 'ifState',
                    'state_y': 'adjState',
                    'timestamp_x': 'timestamp'
                })
            df = df.drop(list(df.filter(regex='_y$')), axis=1) \
                .fillna({'peerIP': '-', 'numChanges': 0,
                         'lastChangeTime': 0})

        # Fill the adjState column with passive if passive
        if 'passive' in df.columns:
            df.loc[df['adjState'].isnull(), 'adjState'] = df['passive']
            df.loc[df['adjState'].eq(True), 'adjState'] = 'passive'
            df.loc[df['adjState'].eq(False), 'adjState'] = 'fail'
            df.drop(columns=['passive'], inplace=True)

        df.bfill(axis=0, inplace=True)

        # Move the timestamp column to the end
        if query_str:
            return df.query(query_str)[cols]
        return df[cols]
Example #12
    def _get_combined_df(self, **kwargs):
        """OSPF has info divided across multiple tables. Get a single one"""

        columns = kwargs.pop('columns', ['default'])
        state = kwargs.pop('state', '')
        addnl_fields = kwargs.pop('addnl_fields', self.iobj._addnl_fields)
        addnl_nbr_fields = self.iobj._addnl_nbr_fields
        user_query = kwargs.pop('query_str', '')

        cols = SchemaForTable('ospf', schema=self.schemas) \
            .get_display_fields(columns)
        if columns == ['default']:
            cols.append('timestamp')

        ifschema = SchemaForTable('ospfIf', schema=self.schemas)
        nbrschema = SchemaForTable('ospfNbr', schema=self.schemas)

        if (columns != ['default']) and (columns != ['*']):
            ifkeys = ifschema.key_fields()
            nbrkeys = nbrschema.key_fields()
            if_flds = ifschema.fields
            nbr_flds = nbrschema.fields

            ifcols = ifkeys
            nbrcols = nbrkeys
            for fld in columns:
                if fld in if_flds and fld not in ifcols:
                    ifcols.append(fld)
                elif fld in nbr_flds and fld not in nbrcols:
                    nbrcols.append(fld)
        else:
            ifcols = ifschema.get_display_fields(columns)
            nbrcols = nbrschema.get_display_fields(columns)

        if state == "full":
            query_str = 'adjState == "full" or adjState == "passive"'
        elif state == "other":
            query_str = 'adjState != "full" and adjState != "passive"'
        elif state == "passive":
            query_str = 'adjState == "passive"'
        else:
            query_str = ''

        df = self.get_valid_df('ospfIf',
                               addnl_fields=addnl_fields,
                               columns=ifcols,
                               **kwargs)
        nbr_df = self.get_valid_df('ospfNbr',
                                   addnl_fields=addnl_nbr_fields,
                                   columns=nbrcols,
                                   **kwargs)
        if nbr_df.empty:
            return df

        merge_cols = [
            x for x in ['namespace', 'hostname', 'ifname']
            if x in nbr_df.columns
        ]
        # Merge the two tables
        df = df.merge(nbr_df, on=merge_cols, how='left')

        # This is because some NOS have the ipAddress in nbr table and some in
        # interface table. Nbr table wins over interface table if present
        if 'ipAddress_y' in df:
            df['ipAddress'] = np.where(df['ipAddress_y'] == "",
                                       df['ipAddress_x'], df['ipAddress_y'])
            df['ipAddress'] = np.where(df['ipAddress'], df['ipAddress'],
                                       df['ipAddress_x'])

        if columns == ['*']:
            df = df.drop(columns=['area_y', 'instance_y', 'vrf_y',
                                  'ipAddress_x', 'ipAddress_y', 'areaStub_y',
                                  'timestamp_y'], errors='ignore') \
                .rename(columns={
                    'instance_x': 'instance', 'areaStub_x': 'areaStub',
                    'area_x': 'area', 'vrf_x': 'vrf',
                    'state_x': 'ifState', 'state_y': 'adjState',
                    'sqvers_x': 'sqvers', 'active_x': 'active',
                    'timestamp_x': 'timestamp'})
        else:
            df = df.rename(
                columns={
                    'vrf_x': 'vrf',
                    'area_x': 'area',
                    'state_x': 'ifState',
                    'state_y': 'adjState',
                    'timestamp_x': 'timestamp'
                })
            df = df.drop(list(df.filter(regex='_y$')), axis=1) \
                   .drop('ipAddress_x', axis=1, errors='ignore') \
                   .fillna({'peerIP': '-', 'numChanges': 0,
                            'lastChangeTime': 0})

        # Fill the adjState column with passive if passive
        if 'passive' in df.columns:
            df.loc[df['adjState'].isnull(), 'adjState'] = df['passive']
            df.loc[df['adjState'].eq(True), 'adjState'] = 'passive'
            df.loc[df['adjState'].eq(False), 'adjState'] = 'fail'
            df.drop(columns=['passive'], inplace=True)

        df.bfill(axis=0, inplace=True)

        if 'peerHostname' in columns or (columns in [['*'], ['default']]):
            nfdf = df.query('adjState != "full"').reset_index()
            nfdf['peerHostname'] = ''
            newdf = df.query('adjState == "full"').reset_index() \
                .drop('peerHostname', axis=1, errors='ignore')
            if not newdf.empty:
                newdf['matchIP'] = newdf.ipAddress.str.split('/').str[0]
                newdf = newdf.merge(newdf[['namespace', 'hostname', 'vrf',
                                           'matchIP']],
                                    left_on=['namespace', 'vrf', 'peerIP'],
                                    right_on=['namespace', 'vrf', 'matchIP'],
                                    suffixes=["", "_y"]) \
                    .rename(columns={'hostname_y': 'peerHostname'}) \
                    .drop_duplicates(subset=['namespace', 'hostname',
                                             'vrf', 'ifname']) \
                    .drop(columns=['matchIP', 'matchIP_y'], errors='ignore')

                if newdf.empty:
                    newdf = df.query('adjState == "full"').reset_index()
                    newdf['peerHostname'] = ''
                final_df = pd.concat([nfdf, newdf])
            else:
                final_df = df
        else:
            final_df = df

        if query_str:
            final_df = final_df.query(query_str).reset_index(drop=True)

        if user_query and not final_df.empty:
            final_df = self._handle_user_query_str(final_df, user_query)
        # Move the timestamp column to the end
        return final_df[cols]
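
Both OSPF examples lean on the same merge-suffix cleanup; here it is in isolation with invented frames:

import pandas as pd

if_df = pd.DataFrame({'hostname': ['leaf01'], 'state': ['up'],
                      'timestamp': [1]})
nbr_df = pd.DataFrame({'hostname': ['leaf01'], 'state': ['full'],
                       'timestamp': [2]})

df = if_df.merge(nbr_df, on='hostname', how='left') \
          .rename(columns={'state_x': 'ifState', 'state_y': 'adjState',
                           'timestamp_x': 'timestamp'})
# drop the remaining merge artifacts ending in _y (here: timestamp_y)
df = df.drop(list(df.filter(regex='_y$')), axis=1)
print(df.columns.tolist())  # ['hostname', 'ifState', 'timestamp', 'adjState']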
Example #13
def test_transform(input_file):
    to_transform = Yaml2Class(input_file)

    try:
        data_directory = to_transform.transform.data_directory
    except AttributeError:
        print('Invalid transformation file, no data directory')
        pytest.fail('AttributeError', pytrace=True)

    #  Make a copy of the data directory
    temp_dir, tmpfile = _coalescer_init(data_directory)

    cfg = load_sq_config(config_file=tmpfile.name)
    schemas = Schema(cfg['schema-directory'])

    for ele in to_transform.transform.transform:
        query_str_list = []
        # Each transformation has a record => writes happen per record
        for record in ele.record:
            changed_fields = set()
            new_df = pd.DataFrame()
            tables = [x for x in dir(record) if not x.startswith('_')]
            for table in tables:
                # Let's read the data in now that we know the table
                tblobj = get_sqobject(table)
                pq_db = get_sqdb_engine(cfg, table, None, None)
                columns = schemas.fields_for_table(table)
                mod_df = tblobj(config_file=tmpfile.name).get(columns=columns)

                for key in getattr(record, table):
                    query_str = key.match
                    chg_df = pd.DataFrame()
                    if query_str != "all":
                        try:
                            chg_df = mod_df.query(query_str) \
                                           .reset_index(drop=True)
                        except Exception as ex:
                            pytest.fail(f'query {query_str} failed: {ex}')
                        query_str_list.append(query_str)
                    else:
                        chg_df = mod_df

                    _process_transform_set(key.set, chg_df, changed_fields)
                    if new_df.empty:
                        new_df = chg_df
                    elif not chg_df.empty:
                        new_df = pd.concat([new_df, chg_df])

                if new_df.empty:
                    continue

                # Write the records now
                _write_verify_transform(new_df, table, pq_db,
                                        SchemaForTable(table,
                                                       schemas), tmpfile.name,
                                        query_str_list, changed_fields)

    # Now we coalesce and verify it works
    from suzieq.sqobjects.tables import TablesObj

    pre_table_df = TablesObj(config_file=tmpfile.name).get()
    do_coalesce(cfg, None)
    _verify_coalescing(temp_dir)

    post_table_df = TablesObj(config_file=tmpfile.name).get()
    assert_df_equal(pre_table_df, post_table_df, None)

    # Run additional tests on the coalesced data
    for ele in to_transform.transform.verify:
        table = [x for x in dir(ele) if not x.startswith('_')][0]
        tblobj = get_sqobject(table)

        for tst in getattr(ele, table):
            start_time = tst.test.get('start-time', '')
            end_time = tst.test.get('end-time', '')

            columns = tst.test.get('columns', ['default'])
            df = tblobj(config_file=tmpfile.name,
                        start_time=start_time,
                        end_time=end_time).get(columns=columns)
            if not df.empty and 'query' in tst.test:
                query_str = tst.test['query']
                df = df.query(query_str).reset_index(drop=True)

            if 'assertempty' in tst.test:
                assert (df.empty)
            elif 'shape' in tst.test:
                shape = tst.test['shape'].split()
                if shape[0] != '*':
                    assert (int(shape[0]) == df.shape[0])
                if shape[1] != '*':
                    assert (int(shape[1]) == df.shape[1])
            else:
                assert (not df.empty)

    _coalescer_cleanup(temp_dir, tmpfile)
Example #14
    def __init__(self,
                 engine_name: str = '',
                 hostname: typing.List[str] = [],
                 start_time: str = '',
                 end_time: str = '',
                 view: str = 'latest',
                 namespace: typing.List[str] = [],
                 columns: typing.List[str] = ['default'],
                 context=None,
                 table: str = '',
                 config_file=None) -> None:

        if context is None:
            self.ctxt = SqContext(engine_name, config_file)
        else:
            self.ctxt = context
            if not self.ctxt:
                self.ctxt = SqContext(engine_name)

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()

        if not namespace and self.ctxt.namespace:
            self.namespace = self.ctxt.namespace
        else:
            self.namespace = namespace
        if not hostname and self.ctxt.hostname:
            self.hostname = self.ctxt.hostname
        else:
            self.hostname = hostname

        if not start_time and self.ctxt.start_time:
            self.start_time = self.ctxt.start_time
        else:
            self.start_time = start_time

        if not end_time and self.ctxt.end_time:
            self.end_time = self.ctxt.end_time
        else:
            self.end_time = end_time

        if not view and self.ctxt.view:
            self.view = self.ctxt.view
        else:
            self.view = view
        self.columns = columns

        if engine_name:
            self.engine = get_sqengine(engine_name)
        else:
            self.engine = self.ctxt.engine

        if not self.engine:
            raise ValueError('Unknown analysis engine')

        if self._table:
            self.engine_obj = self.engine.get_object(self._table, self)
        else:
            self.engine_obj = None

        self._addnl_filter = None
        self._addnl_fields = []
Example #15
    def get(self, **kwargs):
        """Replacing the original interface name in returned result"""

        addnl_fields = kwargs.pop('addnl_fields', [])
        columns = kwargs.get('columns', ['default'])
        vrf = kwargs.pop('vrf', None)
        peer = kwargs.pop('peer', None)
        hostname = kwargs.pop('hostname', None)
        user_query = kwargs.pop('query_str', None)

        drop_cols = ['origPeer', 'peerHost']
        addnl_fields.extend(['origPeer'])
        sch = SchemaForTable(self.iobj.table, self.schemas)
        fields = sch.get_display_fields(columns)

        for col in [
                'peerIP', 'updateSource', 'state', 'namespace', 'vrf', 'peer',
                'hostname'
        ]:
            if col not in fields:
                addnl_fields.append(col)
                drop_cols.append(col)

        try:
            df = super().get(addnl_fields=addnl_fields, **kwargs)
        except KeyError as ex:
            if ('afi' in str(ex)) or ('safi' in str(ex)):
                df = pd.DataFrame({
                    'error':
                    ['ERROR: Migrate BGP data first using sq-coalescer']
                })
                return df
            raise

        if df.empty:
            return df

        if 'afiSafi' in columns or (columns == ['*']):
            df['afiSafi'] = df['afi'] + ' ' + df['safi']
        query_str = build_query_str([],
                                    sch,
                                    vrf=vrf,
                                    peer=peer,
                                    hostname=hostname)
        if 'peer' in df.columns:
            df['peer'] = np.where(df['origPeer'] != "", df['origPeer'],
                                  df['peer'])

        # Convert old data into new 2.0 data format
        if 'peerHostname' in df.columns:
            mdf = self._get_peer_matched_df(df)
            drop_cols = [x for x in drop_cols if x in mdf.columns]
            drop_cols.extend(list(mdf.filter(regex='_y')))
        else:
            mdf = df

        mdf = self._handle_user_query_str(mdf, user_query)

        if query_str:
            return mdf.query(query_str).drop(columns=drop_cols,
                                             errors='ignore')
        else:
            return mdf.drop(columns=drop_cols, errors='ignore')
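
The origPeer/peer coalescing used in both BGP get() variants, standalone; the data is invented:

import numpy as np
import pandas as pd

df = pd.DataFrame({'peer': ['swp1', '10.0.0.2'],
                   'origPeer': ['Ethernet1', '']})
# prefer the originally recorded name when one exists
df['peer'] = np.where(df['origPeer'] != '', df['origPeer'], df['peer'])
print(df['peer'].tolist())  # ['Ethernet1', '10.0.0.2']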
Example #16
    if 'notifcnReason' in df.columns:
        df.rename(columns={'notifcnReason': 'notificnReason'}, inplace=True)

    pq.write_to_dataset(
        table,
        root_path=output_dir,
        partition_cols=partition_cols,
        version="2.0",
        compression='ZSTD',
        row_group_size=100000,
    )

    logger.info(f'Wrote converted {input_dir}')


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print('Usage: convert_parquet <input dir> <output_dir> <schema_dir>')
        sys.exit(1)

    input_dir = Path(sys.argv[1])
    output_dir = sys.argv[2]
    schemas = Schema(sys.argv[3])
    service = input_dir.parts[-1]
    svc_schema = SchemaForTable(service, schema=schemas)

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logger = logging.getLogger('sq-converter')
    convert_dir(input_dir, output_dir, svc_schema)
Example #17
    def aver(self, **kwargs):
        """Assert that the OSPF state is OK"""

        kwargs.pop('columns', [])
        columns = [
            "namespace",
            "hostname",
            "vrf",
            "ifname",
            "routerId",
            "helloTime",
            "deadTime",
            "passive",
            "ipAddress",
            "isUnnumbered",
            "areaStub",
            "networkType",
            "timestamp",
            "area",
            "nbrCount",
        ]

        # we have to not filter hostname at this point because we need to
        #   understand neighbor relationships
        orig_hostname = kwargs.pop('hostname', '')

        ospf_df = self.get_valid_df("ospfIf", columns=columns, **kwargs)
        if ospf_df.empty:
            return pd.DataFrame(columns=columns)

        ospf_df["assertReason"] = [[] for _ in range(len(ospf_df))]
        df = (ospf_df[ospf_df["routerId"] != ""].groupby(
            ["routerId", "namespace"], as_index=False)[[
                "hostname", "namespace"
            ]].agg(lambda x: x.unique().tolist())).dropna(how='any')

        # df is a dataframe with each row containing the routerId and the
        # corresponding list of hostnames with that routerId. In a good
        # configuration, the list must have exactly one entry
        ospf_df['assertReason'] = (ospf_df.merge(
            df, on=["routerId"], how="outer").apply(
                lambda x: ["duplicate routerId {}".format(x["hostname_y"])]
                if len(x['hostname_y']) != 1 else [],
                axis=1))

        # Now check the peering match
        lldpobj = LldpObj(context=self.ctxt)
        lldp_df = lldpobj.get(namespace=kwargs.get("namespace", ""),
                              hostname=kwargs.get("hostname", ""),
                              ifname=kwargs.get("ifname", ""),
                              columns=[
                                  "namespace", "hostname", "ifname",
                                  "peerHostname", "peerIfname", "peerMacaddr"
                              ])
        if lldp_df.empty:
            ospf_df = ospf_df[~(ospf_df.ifname.str.contains('loopback')
                                | ospf_df.ifname.str.contains('Vlan'))]
            ospf_df['assertReason'] = 'No LLDP peering info'
            ospf_df['assert'] = 'fail'
            return ospf_df[[
                'namespace', 'hostname', 'vrf', 'ifname', 'assertReason',
                'assert'
            ]]

        # Create a single massive DF with fields populated appropriately
        use_cols = [
            "namespace",
            "routerId",
            "hostname",
            "vrf",
            "ifname",
            "helloTime",
            "deadTime",
            "passive",
            "ipAddress",
            "areaStub",
            "isUnnumbered",
            "networkType",
            "area",
            "timestamp",
        ]

        int_df = ospf_df[use_cols].merge(lldp_df,
                                         on=["namespace", "hostname",
                                             "ifname"]) \
            .dropna(how="any")

        # filter by hostname now
        if orig_hostname:
            ospfschema = SchemaForTable('ospf', schema=self.schemas)
            hq = build_query_str([], ospfschema, hostname=orig_hostname)
            ospf_df = ospf_df.query(hq)

        if int_df.empty:
            # Weed out the loopback and SVI interfaces as they have no LLDP peers
            ospf_df = ospf_df[~(ospf_df.ifname.str.contains('loopback')
                                | ospf_df.ifname.str.contains('Vlan'))]
            ospf_df['assertReason'] = 'No LLDP peering info'
            ospf_df['assert'] = 'fail'
            return ospf_df[[
                'namespace', 'hostname', 'vrf', 'ifname', 'assertReason',
                'assert'
            ]]

        ospf_df = ospf_df.merge(int_df,
                                left_on=["namespace", "hostname", "ifname"],
                                right_on=["namespace", "peerHostname",
                                          "peerIfname"]) \
            .dropna(how="any")

        # Now start comparing the various parameters
        ospf_df["assertReason"] += ospf_df.apply(
            lambda x: ["subnet mismatch"]
            if ((x["isUnnumbered_x"] != x["isUnnumbered_y"]) and
                (IPv4Network(x["ipAddress_x"], strict=False) != IPv4Network(
                    x["ipAddress_y"], strict=False))) else [],
            axis=1,
        )
        ospf_df["assertReason"] += ospf_df.apply(
            lambda x: ["area mismatch"] if (x["area_x"] != x["area_y"] and x[
                "areaStub_x"] != x["areaStub_y"]) else [],
            axis=1,
        )
        ospf_df["assertReason"] += ospf_df.apply(
            lambda x: ["Hello timers mismatch"]
            if x["helloTime_x"] != x["helloTime_y"] else [],
            axis=1,
        )
        ospf_df["assertReason"] += ospf_df.apply(
            lambda x: ["Dead timer mismatch"]
            if x["deadTime_x"] != x["deadTime_y"] else [],
            axis=1,
        )
        ospf_df["assertReason"] += ospf_df.apply(
            lambda x: ["network type mismatch"]
            if x["networkType_x"] != x["networkType_y"] else [],
            axis=1,
        )
        ospf_df["assertReason"] += ospf_df.apply(
            lambda x: ["passive config mismatch"]
            if x["passive_x"] != x["passive_y"] else [],
            axis=1,
        )
        ospf_df["assertReason"] += ospf_df.apply(
            lambda x: ["vrf mismatch"] if x["vrf_x"] != x["vrf_y"] else [],
            axis=1,
        )

        # Fill up a single assert column now indicating pass/fail
        ospf_df['assert'] = ospf_df.apply(
            lambda x: 'pass' if not len(x['assertReason']) else 'fail', axis=1)

        return (ospf_df.rename(
            index=str,
            columns={
                "hostname_x": "hostname",
                "ifname_x": "ifname",
                "vrf_x": "vrf",
            },
        )[[
            "namespace", "hostname", "ifname", "vrf", "assert", "assertReason",
            "timestamp"
        ]].explode(column='assertReason').fillna({'assertReason': '-'}))
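
The final explode-and-fill step, isolated with invented data; each reason gets its own row and empty lists come out as '-':

import pandas as pd

df = pd.DataFrame({'hostname': ['leaf01', 'leaf02'],
                   'assertReason': [[], ['area mismatch', 'vrf mismatch']]})
out = df.explode(column='assertReason').fillna({'assertReason': '-'})
print(out)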
Example #18
async def init_services(svc_dir: str, schema_dir: str, queue, svclist: list,
                        def_interval: int, run_once: str):
    """Process service definitions by reading each file in svc dir"""

    svcs_list = []
    schemas = defaultdict(dict)

    # Load up all the service definitions we can find
    svc_classes = {}
    for i in walk_packages(path=[dirname(getfile(Service))]):
        for mbr in getmembers(
                importlib.import_module('suzieq.poller.services.' + i.name),
                isclass):
            if mbr[0] == "Service" or not mbr[0].endswith("Service"):
                continue
            svc_classes[i.name] = mbr[1]
            svc_classes[mbr[0]] = mbr[1]

    if not isdir(svc_dir):
        logger.error("services directory not a directory: {}".format(svc_dir))
        return svcs_list

    if not isdir(schema_dir):
        logger.error("schema directory not a directory: {}".format(svc_dir))
        return svcs_list
    else:
        schemas = Schema(schema_dir)

    if schemas:
        poller_schema = schemas.get_arrow_schema("sqPoller")
        poller_schema_version = SchemaForTable('sqPoller', schemas).version

    for root, _, filenames in walk(svc_dir):
        for filename in filenames:
            if filename.endswith(".yml"):
                with open(root + "/" + filename, "r") as f:
                    svc_def = yaml.safe_load(f.read())
                if svc_def.get('service') not in svclist:
                    logger.warning(
                        f'Ignoring unspecified service {svc_def.get("service")}'
                    )
                    continue

                if "service" not in svc_def or "apply" not in svc_def:
                    logger.error('Ignoring invalid service file definition. \
                    Need both "service" and "apply" keywords: {}'.format(
                        filename))
                    continue

                period = svc_def.get("period", def_interval)
                for elem, val in svc_def["apply"].items():
                    if "copy" in val:
                        newval = svc_def["apply"].get(val["copy"], None)
                        if not newval:
                            logger.error("No device type {} to copy from for "
                                         "{} for service {}".format(
                                             val["copy"], elem,
                                             svc_def["service"]))
                            continue
                        val = newval

                    if (("command" not in val) or
                        ((isinstance(val['command'], list)
                          and not all('textfsm' in x or 'normalize' in x
                                      for x in val['command'])) or
                         (not isinstance(val['command'], list) and
                          ("normalize" not in val and "textfsm" not in val)))):
                        logger.error(
                            "Ignoring invalid service file "
                            'definition. Need both "command" and '
                            '"normalize/textfsm" keywords: {}, {}'.format(
                                filename, val))
                        continue

                    if "textfsm" in val:
                        # We may have already visited this element and parsed
                        # the textfsm file. Check for this
                        if val["textfsm"] and isinstance(
                                val["textfsm"], textfsm.TextFSM):
                            continue
                        tfsm_file = svc_dir + "/" + val["textfsm"]
                        if not isfile(tfsm_file):
                            logger.error("Textfsm file {} not found. Ignoring"
                                         " service".format(tfsm_file))
                            continue
                        with open(tfsm_file, "r") as f:
                            tfsm_template = textfsm.TextFSM(f)
                            val["textfsm"] = tfsm_template
                    elif (isinstance(val['command'], list)):
                        for subelem in val['command']:
                            if 'textfsm' in subelem:
                                if subelem["textfsm"] and isinstance(
                                        subelem["textfsm"], textfsm.TextFSM):
                                    continue
                                tfsm_file = svc_dir + "/" + subelem["textfsm"]
                                if not isfile(tfsm_file):
                                    logger.error(
                                        "Textfsm file {} not found. Ignoring"
                                        " service".format(tfsm_file))
                                    continue
                                with open(tfsm_file, "r") as f:
                                    tfsm_template = textfsm.TextFSM(f)
                                    subelem["textfsm"] = tfsm_template
                    else:
                        tfsm_template = None

                try:
                    schema = SchemaForTable(svc_def['service'], schema=schemas)
                except Exception:
                    logger.error(
                        f"No matching schema for {svc_def['service']}")
                    continue

                if schema.type == "derivedRecord":
                    # These are not real services and so ignore them
                    continue

                # Valid service definition, add it to list
                if svc_def["service"] in svc_classes:
                    service = svc_classes[svc_def["service"]](
                        svc_def["service"],
                        svc_def["apply"],
                        period,
                        svc_def.get("type", "state"),
                        svc_def.get("keys", []),
                        svc_def.get("ignore-fields", []),
                        schema,
                        queue,
                        run_once,
                    )
                else:
                    service = Service(svc_def["service"], svc_def["apply"],
                                      period, svc_def.get("type", "state"),
                                      svc_def.get("keys", []),
                                      svc_def.get("ignore-fields",
                                                  []), schema, queue, run_once)

                service.poller_schema = poller_schema
                service.poller_schema_version = poller_schema_version
                logger.info("Service {} added".format(service.name))
                svcs_list.append(service)

    return svcs_list
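
The nested command/textfsm validity condition above is dense; the same rule as a small predicate (a readability sketch, not how the poller itself is structured):

def valid_apply_entry(val: dict) -> bool:
    """An apply entry needs 'command' plus parsing instructions."""
    if 'command' not in val:
        return False
    if isinstance(val['command'], list):
        # every sub-command must say how its output is parsed
        return all('textfsm' in x or 'normalize' in x
                   for x in val['command'])
    return 'textfsm' in val or 'normalize' in val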
Example #19
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)
        query_str = kwargs.pop('query_str', '')

        # The REST API provides the query_str enclosed in quotes; strip them
        if (query_str and query_str.startswith('"')
                and query_str.endswith('"')):
            query_str = query_str[1:-1]

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if columns == ['*']:
            drop_cols.append('sqvers')

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields+addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        for fld in key_fields:
            if fld not in fields+addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        if self.iobj.start_time:
            try:
                start_time = dateparser.parse(
                    self.iobj.start_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
                return pd.DataFrame()
        else:
            start_time = ''

        if self.iobj.start_time and not start_time:
            # Something went wrong with our parsing
            print(f"ERROR: unable to parse {self.iobj.start_time}")
            return pd.DataFrame()

        if self.iobj.end_time:
            try:
                end_time = dateparser.parse(
                    self.iobj.end_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
                return pd.DataFrame()
        else:
            end_time = ''

        if self.iobj.end_time and not end_time:
            # Something went wrong with our parsing
            print(f"ERROR: Unable to parse {self.iobj.end_time}")
            return pd.DataFrame()

        table_df = self._dbeng.read(
            phy_table,
            'pandas',
            start_time=start_time,
            end_time=end_time,
            columns=fields,
            view=view,
            key_fields=key_fields,
            **kwargs
        )

        if not table_df.empty:
            if view == 'latest' and active_only:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            else:
                table_df.drop(columns=drop_cols, inplace=True)
            if 'timestamp' in table_df.columns:
                table_df['timestamp'] = humanize_timestamp(
                    table_df.timestamp, self.cfg.get('analyzer', {})
                    .get('timezone', None))

        if query_str:
            return table_df.query(query_str)
        else:
            return table_df
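
The natural-language time handling above, pulled out into a helper-style sketch (assuming the dateparser package; the function name is made up):

import dateparser

def to_epoch_ms(timestr: str) -> int:
    """'10 minutes ago', '2021-01-01', etc. -> epoch milliseconds."""
    parsed = dateparser.parse(timestr.replace('last night', 'yesterday'))
    if parsed is None:  # dateparser returns None on failure
        raise ValueError(f'unable to parse {timestr}')
    return int(parsed.timestamp() * 1000)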
Example #20
    def get_table_df(self, cfg, schemas, **kwargs) -> pd.DataFrame:
        """Use Pandas instead of Spark to retrieve the data"""

        MAX_FILECNT_TO_READ_FOLDER = 10000

        self.cfg = cfg

        table = kwargs.pop("table")
        start = kwargs.pop("start_time")
        end = kwargs.pop("end_time")
        view = kwargs.pop("view")
        sort_fields = kwargs.pop("sort_fields")
        ign_key_fields = kwargs.pop("ign_key", [])
        addnl_fields = kwargs.pop("addnl_fields", [])

        for f in ['active', 'timestamp']:
            if f not in addnl_fields:
                addnl_fields.append(f)

        sch = SchemaForTable(table, schema=schemas)
        phy_table = sch.get_phy_table_for_table()

        folder = self._get_table_directory(phy_table)

        # Restrict to a single DC if that's what's asked
        if "namespace" in kwargs:
            v = kwargs["namespace"]
            if v:
                if not isinstance(v, list):
                    folder += "/namespace={}/".format(v)

        fcnt = self.get_filecnt(folder)

        if fcnt == 0:
            return pd.DataFrame()

        # We are going to hard-code use_get_files until we have some
        # automated testing
        use_get_files = False

        # use_get_files = (
        #    (fcnt > MAX_FILECNT_TO_READ_FOLDER and view == "latest") or
        #    start or end
        # )

        if use_get_files:
            # Switch to more efficient method when there are lotsa files
            # Reduce I/O since that is the worst drag
            key_fields = []
            if len(kwargs.get("namespace", [])) > 1:
                del kwargs["namespace"]
            files = get_latest_files(folder, start, end, view)
        else:
            # ign_key_fields contains key fields that are not partition cols
            key_fields = [i for i in sch.key_fields()
                          if i not in ign_key_fields]
            filters = self.build_pa_filters(start, end, key_fields, **kwargs)

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        fields = sch.get_display_fields(columns)
        for f in addnl_fields:
            if f not in fields:
                fields.append(f)

        # Create the filter to select only specified columns
        query_str = ""
        prefix = ""
        addnl_filter = kwargs.pop('add_filter', None)
        for f, v in kwargs.items():
            if not v or f in key_fields or f in ["groupby"]:
                continue
            if isinstance(v, str):
                if v.startswith('!'):
                    v = v[1:]
                    op = '!='
                else:
                    op = '=='
                query_str += "{} {}{}'{}' ".format(prefix, f, op, v)
                prefix = "and"
            else:
                query_str += "{} {}=={} ".format(prefix, f, v)
                prefix = "and"

        # Add the ignored fields back to the key fields to ensure the
        # drop_duplicates below works correctly, including reading the
        # required columns
        key_fields.extend(ign_key_fields)

        # Handle the case where key fields are missing from display fields
        fldset = set(fields)
        kfldset = set(key_fields)
        add_flds = kfldset.difference(fldset)
        if add_flds:
            fields.extend(list(add_flds))

        if addnl_filter:
            # This is for special cases that are specific to an object
            if not query_str:
                query_str = addnl_filter
            else:
                query_str += ' and {}'.format(addnl_filter)

        # Restore the folder to what it needs to be
        folder = self._get_table_directory(phy_table)
        if use_get_files:
            if not query_str:
                query_str = "active == True"

            pdf_list = []
            with Executor(max_workers=8) as exe:
                jobs = [
                    exe.submit(self.read_pq_file, f, fields, query_str)
                    for f in files
                ]
                pdf_list = [job.result() for job in jobs]

            if pdf_list:
                final_df = pd.concat(pdf_list)
            else:
                final_df = pd.DataFrame(columns=fields)

        elif view == "latest":
            if not query_str:
                # Make up a dummy query string to avoid if/then/else
                query_str = "timestamp != 0"

            try:
                final_df = (
                    pa.ParquetDataset(
                        folder, filters=filters or None, validate_schema=False
                    )
                    .read(columns=fields)
                    .to_pandas(split_blocks=True, self_destruct=True)
                    .query(query_str)
                    .drop_duplicates(subset=key_fields, keep="last")
                    .query("active == True")
                )
            except pa.lib.ArrowInvalid:
                return pd.DataFrame(columns=fields)
        else:
            if not query_str:
                # Make up a dummy query string to avoid if/then/else
                query_str = 'timestamp != "0"'

            try:
                final_df = (
                    pa.ParquetDataset(
                        folder, filters=filters or None, validate_schema=False
                    )
                    .read(columns=fields)
                    .to_pandas()
                    .query(query_str)
                )
            except pa.lib.ArrowInvalid:
                return pd.DataFrame(columns=fields)

        if 'active' not in columns:
            final_df.drop(columns=['active'], inplace=True)
            fields.remove('active')

        final_df = df_timestamp_to_datetime(final_df)

        # Sort only when every sort field is among the selected fields
        if sort_fields and all(x in fields for x in sort_fields):
            return final_df[fields].sort_values(by=sort_fields)
        else:
            return final_df[fields]
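
The filter loop in the middle of Example #20 builds the pandas query string by hand, with the convention that a leading '!' on a string value flips the operator to '!='. Here is the same logic distilled into a standalone helper to make the convention easier to see; build_query_str is an invented name and this is a sketch, not the project's API:

    def build_query_str(key_fields, **kwargs):
        """Build a pandas query string; '!value' means field != value."""
        terms = []
        for field, value in kwargs.items():
            # key fields are pushed down as dataset filters, not queries
            if not value or field in key_fields or field == 'groupby':
                continue
            if isinstance(value, str):
                if value.startswith('!'):
                    terms.append(f"{field} != '{value[1:]}'")
                else:
                    terms.append(f"{field} == '{value}'")
            else:
                terms.append(f'{field} == {value}')
        return ' and '.join(terms)

    # build_query_str([], hostname='!leaf01', mtu=9216)
    # -> "hostname != 'leaf01' and mtu == 9216"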
Example #21
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        """Read a table, applying the time window, filters and active-only"""
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if columns == ['*']:
            drop_cols.append('sqvers')

        aug_fields = sch.get_augmented_fields()

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields + addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        # Order matters. Don't put this before the missing key fields insert
        for f in aug_fields:
            dep_fields = sch.get_parent_fields(f)
            addnl_fields += dep_fields

        for fld in key_fields:
            if fld not in fields + addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        if self.iobj.start_time:
            try:
                start_time = dateparser.parse(
                    self.iobj.start_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
                return pd.DataFrame()
        else:
            start_time = ''

        if self.iobj.start_time and not start_time:
            # Something went wrong with our parsing
            print(f"ERROR: unable to parse {self.iobj.start_time}")
            return pd.DataFrame()

        if self.iobj.end_time:
            try:
                end_time = dateparser.parse(
                    self.iobj.end_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
                return pd.DataFrame()
        else:
            end_time = ''

        if self.iobj.end_time and not end_time:
            # Something went wrong with our parsing
            print(f"ERROR: Unable to parse {self.iobj.end_time}")
            return pd.DataFrame()

        table_df = self._dbeng.read(phy_table,
                                    'pandas',
                                    start_time=start_time,
                                    end_time=end_time,
                                    columns=fields,
                                    view=view,
                                    key_fields=key_fields,
                                    **kwargs)

        if not table_df.empty:
            if view == "all" or not active_only:
                table_df.drop(columns=drop_cols, inplace=True)
            else:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            if 'timestamp' in table_df.columns and not table_df.empty:
                table_df['timestamp'] = humanize_timestamp(
                    table_df.timestamp,
                    self.cfg.get('analyzer', {}).get('timezone', None))

        return table_df
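
Most of get_valid_df is column bookkeeping: timestamp is forced to be the last field, key fields the caller did not ask for are fetched anyway and queued for dropping, and additional fields slot in just before timestamp. A sketch of that assembly logic in isolation, under the same ordering rules; assemble_fields is an invented name:

    def assemble_fields(fields, key_fields, addnl_fields):
        """Return (fields, drop_cols), keeping timestamp as the last field."""
        fields = list(fields)
        addnl = list(addnl_fields)
        drop_cols = []
        if 'timestamp' not in fields:
            fields.append('timestamp')
        for fld in key_fields:
            if fld not in fields + addnl:
                addnl.insert(0, fld)
                drop_cols.append(fld)
        for f in addnl:
            if f not in fields:
                fields.insert(-1, f)  # timestamp stays last
        return fields, drop_cols

    # assemble_fields(['ifname', 'state'], ['namespace', 'hostname'], [])
    # -> (['ifname', 'state', 'hostname', 'namespace', 'timestamp'],
    #     ['namespace', 'hostname'])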
Example #22
    def migrate(self, table_name: str, schema: SchemaForTable) -> None:
        """Migrates the data for the table specified to latest version

        :param table_name: str, The name of the table to migrate
        :param schema: SchemaForTable, the current schema
        :returns: None
        """

        current_vers = schema.version
        defvals = self._get_default_vals()
        arrow_schema = schema.get_arrow_schema()
        schema_def = dict(zip(arrow_schema.names, arrow_schema.types))

        for sqvers in self._get_avail_sqvers(table_name, True):
            if sqvers != current_vers:
                migrate_rtn = get_migrate_fn(table_name, sqvers, current_vers)
                if migrate_rtn:
                    dataset = self._get_cp_dataset(table_name, True, sqvers,
                                                   'all', '', '')
                    for item in dataset.files:
                        try:
                            namespace = item.split('namespace=')[1] \
                                            .split('/')[0]
                        except IndexError:
                            # Don't convert data not in our template
                            continue

                        df = pd.read_parquet(item)
                        df['sqvers'] = sqvers
                        df['namespace'] = namespace
                        newdf = migrate_rtn(df)

                        cols = newdf.columns
                        # Ensure all fields are present
                        for field in schema_def:
                            if field not in cols:
                                newdf[field] = defvals.get(
                                    schema_def[field], '')

                        # Drop the partition columns: they are encoded in
                        # the output path rather than stored in the file
                        newdf = newdf.drop(columns=['namespace', 'sqvers'])

                        newitem = item.replace(f'sqvers={sqvers}',
                                               f'sqvers={current_vers}')
                        newdir = os.path.dirname(newitem)
                        if not os.path.exists(newdir):
                            os.makedirs(newdir, exist_ok=True)

                        table = pa.Table.from_pandas(
                            newdf,
                            schema=schema.get_arrow_schema(),
                            preserve_index=False)
                        pq.write_to_dataset(table,
                                            newitem,
                                            version="2.0",
                                            compression="ZSTD",
                                            row_group_size=100000)
                        self.logger.debug(
                            f'Migrated {item} version {sqvers}->{current_vers}'
                        )
                        os.remove(item)

                    rmtree(
                        f'{self._get_table_directory(table_name, True)}/sqvers={sqvers}',
                        ignore_errors=True)
        return
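
migrate() leans on get_migrate_fn() to find a version-specific conversion routine and applies it file by file. The source does not show what such a routine looks like, so here is a purely hypothetical example of the expected shape (a DataFrame-in, DataFrame-out callable); the column rename is invented for illustration:

    import pandas as pd

    # Hypothetical: the kind of callable get_migrate_fn(table, '1.0', '2.0')
    # might return. Real migrations depend on the actual schema diff.
    def migrate_v1_to_v2(df: pd.DataFrame) -> pd.DataFrame:
        if 'peerIP' in df.columns:  # invented field name
            df = df.rename(columns={'peerIP': 'peerAddress'})
        df['sqvers'] = '2.0'        # stamp the new schema version
        return df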
Example #23
class SqObject:
    def __init__(self,
                 engine_name: str = 'pandas',
                 hostname: typing.List[str] = [],
                 start_time: str = '',
                 end_time: str = '',
                 view: str = 'latest',
                 namespace: typing.List[str] = [],
                 columns: typing.List[str] = ['default'],
                 context=None,
                 table: str = '',
                 config_file=None) -> None:

        if context is None:
            self.ctxt = SqContext(engine_name, config_file)
        else:
            self.ctxt = context
            if not self.ctxt:
                self.ctxt = SqContext(engine_name)

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()

        if not namespace and self.ctxt.namespace:
            self.namespace = self.ctxt.namespace
        else:
            self.namespace = namespace
        if not hostname and self.ctxt.hostname:
            self.hostname = self.ctxt.hostname
        else:
            self.hostname = hostname

        if not start_time and self.ctxt.start_time:
            self.start_time = self.ctxt.start_time
        else:
            self.start_time = start_time

        if not end_time and self.ctxt.end_time:
            self.end_time = self.ctxt.end_time
        else:
            self.end_time = end_time

        if not view and self.ctxt.view:
            self.view = self.ctxt.view
        else:
            self.view = view
        self.columns = columns

        self.engine = None
        if engine_name:
            self.engine = get_sqengine(engine_name, self._table)(self._table,
                                                                 self)
        elif self.ctxt.engine:
            self.engine = get_sqengine(self.ctxt.engine,
                                       self._table)(self._table, self)

        if not self.engine:
            raise ValueError('Unknown analysis engine')

        self._addnl_filter = None
        self._addnl_fields = []
        self._valid_get_args = None
        self._valid_assert_args = None
        self._valid_arg_vals = None

    @property
    def all_schemas(self):
        return self.ctxt.schemas

    @property
    def schema(self):
        return self._schema

    @property
    def cfg(self):
        return self._cfg

    @property
    def table(self):
        return self._table

    def _check_input_for_valid_args(
        self,
        good_arg_list,
        **kwargs,
    ):
        if not good_arg_list:
            return

        # add standard args that are always valid
        good_arg_list = good_arg_list + ['namespace', 'addnl_fields']

        for arg in kwargs.keys():
            if arg not in good_arg_list:
                raise AttributeError(
                    f"argument {arg} not supported for this command")

    def _check_input_for_valid_vals(self, good_arg_val_list, **kwargs):
        '''Check if the input is valid for the arg, if possible'''

        if not good_arg_val_list:
            return

        for arg in kwargs.keys():
            if arg in good_arg_val_list:
                if kwargs[arg] not in good_arg_val_list[arg]:
                    raise AttributeError(
                        f"invalid value {kwargs[arg]} for argument {arg}")

    def validate_get_input(self, **kwargs):
        self._check_input_for_valid_args(self._valid_get_args + ['columns'],
                                         **kwargs)
        self._check_input_for_valid_vals(self._valid_arg_vals, **kwargs)

    def validate_assert_input(self, **kwargs):
        self._check_input_for_valid_args(self._valid_assert_args, **kwargs)

    def get(self, **kwargs) -> pd.DataFrame:

        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        if self._addnl_filter:
            kwargs['add_filter'] = self._addnl_filter

        # This raises exceptions if it fails
        try:
            self.validate_get_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        return self.engine.get(**kwargs)

    def summarize(self,
                  namespace=[],
                  hostname=[],
                  query_str='') -> pd.DataFrame:
        if self.columns != ["default"]:
            self.summarize_df = pd.DataFrame({
                'error': ['ERROR: You cannot specify columns with summarize']
            })
            return self.summarize_df
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine.summarize(namespace=namespace,
                                     hostname=hostname,
                                     query_str=query_str)

    def unique(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        columns = kwargs.pop('columns', self.columns)
        return self.engine.unique(**kwargs, columns=columns)

    def analyze(self, **kwargs):
        raise NotImplementedError

    def aver(self, **kwargs):
        raise NotImplementedError

    def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        # if self._valid_get_args:
        #     self._valid_get_args += ['what', 'n', 'reverse']
        # This raises exceptions if it fails
        try:
            self.validate_get_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        table_schema = SchemaForTable(self._table, self.all_schemas)
        columns = table_schema.get_display_fields(columns)

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine.top(what=what, n=n, reverse=reverse, **kwargs)

    def humanize_fields(self, df: pd.DataFrame, subset=None) -> pd.DataFrame:
        '''Humanize the fields for human consumption.

        Individual classes will implement the right transformations. This
        routine is just a placeholder for all those with nothing to modify.
        '''
        return df
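
SqObject is clearly meant to be subclassed once per table, with the subclass filling in the _valid_* lists that validate_get_input() consults. A hedged sketch of such a subclass, assuming the SqObject from Example #23 is importable; the table name and the argument list are assumptions for illustration, not taken from the source:

    class InterfacesObj(SqObject):
        """Hypothetical per-table object wiring in its valid get arguments."""
        def __init__(self, **kwargs):
            super().__init__(table='interfaces', **kwargs)
            self._valid_get_args = ['namespace', 'hostname', 'columns',
                                    'ifname', 'state', 'type', 'mtu']

    # obj = InterfacesObj(view='latest')
    # obj.get(hostname=['leaf01'], state='up')  # validated, then delegated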
Example #24
class SqObject:
    def __init__(self,
                 engine_name: str = '',
                 hostname: typing.List[str] = [],
                 start_time: str = '',
                 end_time: str = '',
                 view: str = 'latest',
                 namespace: typing.List[str] = [],
                 columns: typing.List[str] = ['default'],
                 context=None,
                 table: str = '',
                 config_file=None) -> None:

        if context is None:
            self.ctxt = SqContext(engine_name, config_file)
        else:
            self.ctxt = context
            if not self.ctxt:
                self.ctxt = SqContext(engine_name)

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()

        if not namespace and self.ctxt.namespace:
            self.namespace = self.ctxt.namespace
        else:
            self.namespace = namespace
        if not hostname and self.ctxt.hostname:
            self.hostname = self.ctxt.hostname
        else:
            self.hostname = hostname

        if not start_time and self.ctxt.start_time:
            self.start_time = self.ctxt.start_time
        else:
            self.start_time = start_time

        if not end_time and self.ctxt.end_time:
            self.end_time = self.ctxt.end_time
        else:
            self.end_time = end_time

        if not view and self.ctxt.view:
            self.view = self.ctxt.view
        else:
            self.view = view
        self.columns = columns

        if engine_name:
            self.engine = get_sqengine(engine_name)
        else:
            self.engine = self.ctxt.engine

        if self._table:
            self.engine_obj = self.engine.get_object(self._table, self)
        else:
            self.engine_obj = None

        self._addnl_filter = None
        self._addnl_fields = []

    @property
    def all_schemas(self):
        return self.ctxt.schemas

    @property
    def schema(self):
        return self._schema

    @property
    def cfg(self):
        return self._cfg

    @property
    def table(self):
        return self._table

    def validate_input(self, **kwargs):
        """Dummy validate input"""
        return

    def get(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        if self._addnl_filter:
            kwargs['add_filter'] = self._addnl_filter

        # This raises exceptions if it fails
        try:
            self.validate_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        return self.engine_obj.get(**kwargs)

    def summarize(self, namespace='') -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine_obj.summarize(namespace=namespace)

    def unique(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine_obj.unique(**kwargs)

    def analyze(self, **kwargs):
        raise NotImplementedError

    def aver(self, **kwargs):
        raise NotImplementedError

    def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        table_schema = SchemaForTable(self._table, self.all_schemas)
        columns = table_schema.get_display_fields(columns)

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine_obj.top(what=what, n=n, reverse=reverse, **kwargs)
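
As an aside, both versions of top() spend four lines popping columns out of kwargs; dict.pop with a default, which get_valid_df in Example #21 already uses, does the same in one:

    columns = kwargs.pop('columns', ['default'])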