Example 1
    def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
        '''Humanize the timestamp and status change time fields'''
        if df.empty:
            return df

        if 'statusChangeTimestamp' in df.columns:
            df['statusChangeTimestamp'] = humanize_timestamp(
                df.statusChangeTimestamp,
                self.cfg.get('analyzer', {}).get('timezone', None))

        return super().humanize_fields(df)
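Every example on this page leans on the same helper. As a rough sketch (the real helper ships with SuzieQ; the name humanize_ts_sketch and the body below are illustrative assumptions, not its actual implementation), humanize_timestamp takes a series of epoch-millisecond values and returns timezone-aware pandas Timestamps in the configured analyzer timezone:

import pandas as pd

def humanize_ts_sketch(series: pd.Series, tz: str = None) -> pd.Series:
    # Epoch milliseconds -> timezone-aware pandas Timestamps
    ts = pd.to_datetime(series, unit='ms', utc=True)
    # Convert to the requested timezone, if one was configured
    return ts.dt.tz_convert(tz) if tz else ts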
Example 2
    def summarize(self, **kwargs):
        """Describe the data"""

        # Discard these
        kwargs.pop('columns', None)

        # 'ospfIf' is ignored
        self._init_summarize(**kwargs)
        if self.summary_df.empty:
            return self.summary_df

        self._summarize_on_add_field = [
            ('deviceCnt', 'hostname', 'nunique'),
            ('peerCnt', 'hostname', 'count'),
        ]

        self._summarize_on_add_with_query = [
            ('stubbyPeerCnt', 'areaStub', 'areaStub'),
            ('passivePeerCnt', 'adjState == "passive"', 'ifname'),
            ('unnumberedPeerCnt', 'isUnnumbered', 'isUnnumbered'),
            ('failedPeerCnt', 'adjState != "passive" and nbrCount == 0',
             'ifname'),
        ]

        self._summarize_on_add_list_or_count = [
            ('area', 'area'),
            ('vrf', 'vrf'),
            ('helloTime', 'helloTime'),
            ('deadTime', 'deadTime'),
            ('retxTime', 'retxTime'),
            ('networkType', 'networkType'),
        ]

        self.summary_df['lastChangeTime'] = np.where(
            self.summary_df.lastChangeTime.isnull(), 0,
            self.summary_df.lastChangeTime)

        self.summary_df['lastChangeTime'] = humanize_timestamp(
            self.summary_df.lastChangeTime, self.cfg.get('analyzer', {})
            .get('timezone', None))

        self.summary_df['lastChangeTime'] = (
            self.summary_df['timestamp'] - self.summary_df['lastChangeTime'])
        self.summary_df['lastChangeTime'] = self.summary_df['lastChangeTime'] \
            .apply(lambda x: x.round('s'))

        self._summarize_on_add_stat = [
            ('adjChangesStat', '', 'numChanges'),
            ('upTimeStat', 'adjState == "full"', 'lastChangeTime'),
        ]

        self._gen_summarize_data()
        self._post_summarize()
        return self.ns_df.convert_dtypes()
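The lastChangeTime block above converts an absolute change timestamp into an age relative to the record's own timestamp. The same pattern, isolated as a minimal sketch with made-up data:

import pandas as pd

df = pd.DataFrame({
    'timestamp': pd.to_datetime(['2021-06-01 12:00:00.7']),
    'lastChangeTime': pd.to_datetime(['2021-06-01 10:00:00']),
})
# Age of the adjacency: record time minus last state change, rounded to seconds
age = (df['timestamp'] - df['lastChangeTime']).apply(lambda x: x.round('s'))
print(age.iloc[0])  # 0 days 02:00:01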
Example 3
    def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
        '''Humanize the timestamp fields'''
        if df.empty:
            return df

        if 'lastUpdate' in df.columns:
            df['lastUpdate'] = humanize_timestamp(
                df.lastUpdate,
                self.cfg.get('analyzer', {}).get('timezone', None))

        return super().humanize_fields(df)
Example 4
    def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
        '''Humanize the fields for human consumption.

        Individual classes will implement the right transformations. This
        routine is just a placeholder for all those classes with nothing to
        modify.
        '''
        if 'timestamp' in df.columns and not df.empty:
            df['timestamp'] = humanize_timestamp(df.timestamp,
                                                 self.cfg.get('analyzer', {})
                                                 .get('timezone', None))

        return df
Example 5
    def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
        '''Humanize the timestamp and boot time fields'''
        if df.empty:
            return df

        # bootupTimestamp arrives in epoch seconds; scale to the epoch
        # milliseconds the humanizer expects
        if 'bootupTimestamp' in df.columns:
            df['bootupTimestamp'] = humanize_timestamp(
                df['bootupTimestamp']*1000,
                self.cfg.get('analyzer', {}).get('timezone', None))

        return super().humanize_fields(df)
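The *1000 above is a unit fix: bootupTimestamp arrives from the device in epoch seconds, while the humanizer works in epoch milliseconds. A quick illustration of the scaling with plain pandas:

import pandas as pd

secs = pd.Series([1_600_000_000])   # epoch seconds, as reported by the device
ms = secs * 1000                    # scale to epoch milliseconds
print(pd.to_datetime(ms, unit='ms', utc=True).iloc[0])
# 2020-09-13 12:26:40+00:00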
Example 6
    def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
        '''Humanize the timestamp and last change time fields'''
        if df.empty:
            return df

        if 'lastChangeTime' in df.columns:
            df['lastChangeTime'] = humanize_timestamp(
                df.lastChangeTime.fillna(0),
                self.cfg.get('analyzer', {}).get('timezone', None))

            if 'adjState' in df.columns:
                df['lastChangeTime'] = np.where(df.adjState == "passive",
                                                pd.Timestamp(0),
                                                df.lastChangeTime)

        return super().humanize_fields(df)
Example 7
def get_file_timestamps(filelist: List[str]) -> pd.DataFrame:
    """Read the files and construct a dataframe of files and timestamp of
       record in them.

    :param filelist: list, of full path name files, typically from pyarrow's
                     dataset.files
    :returns: dataframe of filename with the time it represents, sorted
    :rtype: pandas.DataFrame

    """
    if not filelist:
        return pd.DataFrame(columns=['file', 'timestamp'])

    # We can't rely on the filesystem stat times to find the times involved,
    # so read the data for each file and check. We tried threading and it
    # didn't dramatically alter the results; given that we might already have
    # too many threads running with the poller and everything, we skipped it.
    fname_list = []
    fts_list = []
    for file in filelist:
        try:
            ts = pd.read_parquet(file, columns=['timestamp'])
            fts_list.append(ts.timestamp.min())
            fname_list.append(file)
        except OSError:
            # Skip this file: it can't be read, probably because it's 0 bytes
            logging.debug(f"not reading timestamp for {file}")

    # Construct a file dataframe as it's simpler to deal with
    if fname_list:
        fdf = pd.DataFrame({'file': fname_list, 'timestamp': fts_list})
        fdf['timestamp'] = humanize_timestamp(fdf.timestamp, 'UTC')
        return fdf.sort_values(by=['timestamp'])

    return pd.DataFrame(columns=['file', 'timestamp'])
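A hypothetical invocation, assuming a pyarrow dataset rooted at ./parquet/device (the path is made up for illustration); dataset.files yields exactly the full-path list the docstring asks for:

import pyarrow.dataset as ds

dataset = ds.dataset('./parquet/device', format='parquet')  # hypothetical path
fdf = get_file_timestamps(dataset.files)
print(fdf.head())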
Example 8
def _write_verify_transform(mod_df, table, dbeng, schema, config_file,
                            query_str_list, changed_fields):
    """Write and verify that the written data is present

    :param mod_df: pd.DataFrame, the modified dataframe to write
    :param table: str, the name of the table to write
    :param dbeng: SqParquetDB, pointer to DB class to write/read
    :param schema: SchemaForTable, Schema of data to be written
    :param config_file: str, Filename where suzieq config is stored
    :param query_str_list: List[str], query string if any to apply to data for
                           verification check
    :param changed_fields: set, list of changed fields to verify
    :returns: Nothing
    :rtype: None

    """
    mod_df = mod_df.reset_index(drop=True)
    # Convert datetime64[ns] values to epoch milliseconds for the writer
    mod_df.timestamp = mod_df.timestamp.view(np.int64)
    mod_df.timestamp = mod_df.timestamp // 1000000
    mod_df.sqvers = mod_df.sqvers.astype(str)
    dbeng.write(table, 'pandas', mod_df, False, schema.get_arrow_schema(),
                None)

    # Verify that what we wrote is what we got back
    mod_df.sqvers = mod_df.sqvers.astype(float)

    tblobj = get_sqobject(table)
    post_read_df = tblobj(config_file=config_file).get(columns=schema.fields)

    assert (not post_read_df.empty)
    # If the data was built up as a series of queries, we have to
    # apply the queries to verify that we have what we wrote
    dfconcat = None
    if query_str_list:
        for qstr in query_str_list:
            qdf = post_read_df.query(qstr).reset_index(drop=True)
            assert (not qdf.empty)
            if dfconcat is not None:
                dfconcat = pd.concat([dfconcat, qdf])
            else:
                dfconcat = qdf

    if dfconcat is not None:
        qdf = dfconcat.set_index(schema.key_fields()) \
                      .sort_index()
    else:
        qdf = post_read_df.set_index(schema.key_fields()) \
                          .sort_index()

    mod_df = mod_df.set_index(schema.key_fields()) \
                   .query('~index.duplicated(keep="last")') \
                   .sort_index()

    mod_df.timestamp = humanize_timestamp(mod_df.timestamp, 'GMT')

    # We can't call assert_df_equal directly on the full frames, so we
    # compare shape, columns and index separately. The catch is that this
    # won't detect accidental changes to the supposedly unchanged fields.
    assert (mod_df.shape == qdf.shape)

    assert (not [
        x for x in mod_df.columns.tolist() if x not in qdf.columns.tolist()
    ])

    assert ((mod_df.index == qdf.index).all())

    assert_df_equal(mod_df[changed_fields].reset_index(),
                    qdf[changed_fields].reset_index(), None)
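The timestamp juggling at the top converts pandas datetime64[ns] values into the epoch milliseconds the writer stores, and humanize_timestamp later reverses it. The round trip as a standalone sketch (Series.view is deprecated in recent pandas; astype is the forward-compatible spelling):

import numpy as np
import pandas as pd

ts = pd.Series(pd.to_datetime(['2021-01-01 00:00:00']))
epoch_ms = ts.astype(np.int64) // 1_000_000   # datetime64[ns] -> epoch ms
roundtrip = pd.to_datetime(epoch_ms, unit='ms')
assert (ts == roundtrip).all()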
Example 9
    def get(self, **kwargs):
        """Get the information requested"""
        view = kwargs.get('view', self.iobj.view)
        columns = kwargs.get('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        user_query = kwargs.pop('query_str', '')
        status = kwargs.pop('status', '')
        os_version = kwargs.pop('version', '')
        vendor = kwargs.get('vendor', '')
        model = kwargs.get('model', '')
        os = kwargs.get('os', '')

        drop_cols = []

        if 'active' not in addnl_fields+columns and columns != ['*']:
            addnl_fields.append('active')
            drop_cols.append('active')

        # os is not included in the default column list. Why? I was dumb
        if (columns == ['default'] and os) or (os and 'os' not in columns):
            addnl_fields.append('os')
            drop_cols.append('os')

        for col in ['namespace', 'hostname', 'status', 'address']:
            if columns not in [['default'], ['*']] and col not in columns:
                addnl_fields.append(col)
                drop_cols.append(col)

        if columns == ['*'] or 'uptime' in columns:
            if columns != ['*'] and 'bootupTimestamp' not in columns:
                addnl_fields.append('bootupTimestamp')
                drop_cols.append('bootupTimestamp')

        df = super().get(active_only=False, addnl_fields=addnl_fields,
                         **kwargs)
        if view == 'latest' and 'status' in df.columns:
            df['status'] = np.where(df.active, df['status'], 'dead')

        poller_df = self._get_table_sqobj('sqPoller').get(
            namespace=kwargs.get('namespace', []),
            hostname=kwargs.get('hostname', []),
            service='device',
            columns='namespace hostname status'.split())

        if not poller_df.empty:
            # Identify the address to namespace/hostname mapping
            addr_dict = {f"{x['namespace']}-{x['address']}": x['hostname']
                         for x in df[['namespace', 'address', 'hostname']]
                         .to_dict(orient='records')}

            poller_df['hostname'] = poller_df.apply(
                lambda x, y: y.get(f"{x['namespace']}-{x['hostname']}",
                                   x['hostname']),
                args=(addr_dict,), axis=1)

            poller_df = poller_df\
                .drop_duplicates(subset=['namespace', 'hostname'],
                                 keep='last') \
                .reset_index(drop=True)

            df = df.merge(poller_df, on=['namespace', 'hostname'],
                          how='outer', suffixes=['', '_y'])  \
                .fillna({'bootupTimestamp': 0, 'timestamp': 0,
                         'active': True}) \
                .fillna('N/A')

            df.status = np.where(
                (df['status_y'] != 0) & (df['status_y'] != 200) &
                (df['status'] == "N/A"),
                'neverpoll', df['status'])
            df = df[df.status != 'N/A']
            df.timestamp = np.where(df['timestamp'] == 0,
                                    df['timestamp_y'], df['timestamp'])
            if 'address' in df.columns:
                df.address = np.where(df['address'] == 'N/A', df['hostname'],
                                      df['address'])

            drop_cols.extend(['status_y', 'timestamp_y'])

            if 'uptime' in columns or columns == ['*']:
                uptime_cols = (df['timestamp'] -
                               humanize_timestamp(df['bootupTimestamp']*1000,
                               self.cfg.get('analyzer', {}).get('timezone',
                                                                None)))
                uptime_cols = pd.to_timedelta(uptime_cols, unit='s')
                df.insert(len(df.columns)-1, 'uptime', uptime_cols)

        if df.empty:
            return df

        # The poller merge kills the filtering we did earlier, so redo:
        if status:
            df = df.loc[df.status.isin(status)]
        if vendor:
            df = df.loc[df.vendor.isin(vendor)]
        if model:
            df = df.loc[df.model.isin(model)]
        if os:
            df = df.loc[df.os.isin(os)]
        if os_version:
            # Check two-character operators before their one-character
            # prefixes; otherwise '>=' would match '>' and leave a stray '='
            opdict = {'>=': operator.ge, '<=': operator.le,
                      '!=': operator.ne, '>': operator.gt,
                      '<': operator.lt, '=': operator.eq}
            op = operator.eq
            for elem, val in opdict.items():
                if os_version.startswith(elem):
                    os_version = os_version.replace(elem, '')
                    op = val
                    break

            df = df.loc[df.version.apply(
                lambda x: op(version.LegacyVersion(x),
                             version.LegacyVersion(os_version)))]

        df = self._handle_user_query_str(df, user_query)

        # If the poller has failed completely, the column order can get
        # messed up; reorder explicitly
        cols = self.iobj.schema.get_display_fields(columns)
        if columns == ['default'] and 'timestamp' not in cols:
            cols.append('timestamp')
        if 'sqvers' in cols:
            cols.remove('sqvers')
        return df.drop(columns=drop_cols, errors='ignore')[cols]
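The os_version block strips an optional comparison prefix off the version string before comparing. Isolated with an illustrative input (version.parse is used here for simplicity; the method above uses LegacyVersion, which newer packaging releases have removed):

import operator

from packaging import version

opdict = {'>=': operator.ge, '<=': operator.le, '!=': operator.ne,
          '>': operator.gt, '<': operator.lt, '=': operator.eq}
os_version = '>=4.21'
op = operator.eq
for elem, val in opdict.items():
    if os_version.startswith(elem):
        os_version = os_version.replace(elem, '')
        op = val
        break
print(op(version.parse('4.23'), version.parse(os_version)))  # True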
Example 10
    def summarize(self, **kwargs) -> pd.DataFrame:
        """Summarize key information about BGP"""

        self._init_summarize(**kwargs)
        if self.summary_df.empty or ('error' in self.summary_df.columns):
            return self.summary_df

        self.summary_df['afiSafi'] = (self.summary_df['afi'] + ' ' +
                                      self.summary_df['safi'])

        afi_safi_count = self.summary_df.groupby(by=['namespace'])['afiSafi'] \
                                        .nunique()

        self.summary_df = self.summary_df \
                              .set_index(['namespace', 'hostname', 'vrf',
                                          'peer']) \
                              .query('~index.duplicated(keep="last")') \
                              .reset_index()
        self.ns = {i: {} for i in self.summary_df['namespace'].unique()}
        self.nsgrp = self.summary_df.groupby(by=["namespace"], observed=True)

        self._summarize_on_add_field = [('deviceCnt', 'hostname', 'nunique'),
                                        ('totalPeerCnt', 'peer', 'count'),
                                        ('uniqueAsnCnt', 'asn', 'nunique'),
                                        ('uniqueVrfsCnt', 'vrf', 'nunique')]

        self._summarize_on_add_with_query = [
            ('failedPeerCnt', 'state == "NotEstd"', 'peer'),
            ('iBGPPeerCnt', 'asn == peerAsn', 'peer'),
            ('eBGPPeerCnt', 'asn != peerAsn', 'peer'),
            ('rrClientPeerCnt', 'rrclient.str.lower() == "true"', 'peer',
             'count'),
        ]

        self._gen_summarize_data()

        for ns in self.ns:
            self.ns[ns]['activeAfiSafiCnt'] = afi_safi_count[ns]
        self.summary_row_order.append('activeAfiSafiCnt')

        self.summary_df['estdTime'] = humanize_timestamp(
            self.summary_df.estdTime,
            self.cfg.get('analyzer', {}).get('timezone', None))

        self.summary_df['estdTime'] = (self.summary_df['timestamp'] -
                                       self.summary_df['estdTime'])
        self.summary_df['estdTime'] = self.summary_df['estdTime'] \
                                          .apply(lambda x: x.round('s'))
        # Now come the BGP specific ones
        established = self.summary_df.query("state == 'Established'") \
            .groupby(by=['namespace'])

        uptime = established["estdTime"]
        rx_updates = established["updatesRx"]
        tx_updates = established["updatesTx"]
        self._add_stats_to_summary(uptime, 'upTimeStat')
        self._add_stats_to_summary(rx_updates, 'updatesRxStat')
        self._add_stats_to_summary(tx_updates, 'updatesTxStat')

        self.summary_row_order.extend(
            ['upTimeStat', 'updatesRxStat', 'updatesTxStat'])

        self._post_summarize()
        return self.ns_df.convert_dtypes()
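The activeAfiSafiCnt figure is a per-namespace count of distinct afi/safi combinations. The groupby in isolation, with made-up data:

import pandas as pd

df = pd.DataFrame({'namespace': ['dc1', 'dc1', 'dc2'],
                   'afiSafi': ['ipv4 unicast', 'l2vpn evpn', 'ipv4 unicast']})
print(df.groupby('namespace')['afiSafi'].nunique())
# dc1    2
# dc2    1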
Example 11
    def get_valid_df(self, table: str, **kwargs) -> pd.DataFrame:
        """The heart of the engine: retrieving the data from the backing store

        Args:
            table (str): Name of the table to retrieve the data for

        Returns:
            pd.DataFrame: The data as a pandas dataframe
        """
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        # Thanks to things like OSPF, we cannot use self.schema here
        sch = SchemaForTable(table, self.all_schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)
        hostname = kwargs.pop('hostname', [])

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if columns == ['*']:
            drop_cols.append('sqvers')

        aug_fields = sch.get_augmented_fields()

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields + addnl_fields:
            addnl_fields.append('active')
            if view != 'all':
                drop_cols.append('active')

        # Order matters. Don't put this before the missing key fields insert
        for f in aug_fields:
            dep_fields = sch.get_parent_fields(f)
            addnl_fields += dep_fields

        for fld in key_fields:
            if fld not in fields + addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        if self.iobj.start_time:
            try:
                start_time = int(
                    dateparser.parse(
                        self.iobj.start_time.replace(
                            'last night', 'yesterday')).timestamp() * 1000)
            except Exception:
                # pylint: disable=raise-missing-from
                raise ValueError(
                    f"unable to parse start-time: {self.iobj.start_time}")
        else:
            start_time = ''

        if self.iobj.start_time and not start_time:
            # Something went wrong with our parsing
            # pylint: disable=raise-missing-from
            raise ValueError(
                f"unable to parse start-time: {self.iobj.start_time}")

        if self.iobj.end_time:
            try:
                end_time = int(
                    dateparser.parse(
                        self.iobj.end_time.replace(
                            'last night', 'yesterday')).timestamp() * 1000)
            except Exception:
                # pylint: disable=raise-missing-from
                raise ValueError(
                    f"unable to parse end-time: {self.iobj.end_time}")
        else:
            end_time = ''

        if self.iobj.end_time and not end_time:
            # Something went wrong with our parsing
            # pylint: disable=raise-missing-from
            raise ValueError(f"unable to parse end-time: {self.iobj.end_time}")

        table_df = self._dbeng.read(phy_table,
                                    'pandas',
                                    start_time=start_time,
                                    end_time=end_time,
                                    columns=fields,
                                    view=view,
                                    key_fields=key_fields,
                                    **kwargs)

        if not table_df.empty:
            # hostname may not have been filtered if using regex
            if hostname:
                hdf_list = []
                for hn in hostname:
                    if hn.startswith('~'):
                        hn = hn[1:]
                    df1 = table_df.query(f"hostname.str.match('{hn}')")
                    if not df1.empty:
                        hdf_list.append(df1)

                if hdf_list:
                    table_df = pd.concat(hdf_list)
                else:
                    return pd.DataFrame(columns=table_df.columns.tolist())

            if view == "all" or not active_only:
                table_df.drop(columns=drop_cols, inplace=True)
            else:
                table_df = table_df.query('active') \
                    .drop(columns=drop_cols)
            if 'timestamp' in table_df.columns and not table_df.empty:
                table_df['timestamp'] = humanize_timestamp(
                    table_df.timestamp,
                    self.cfg.get('analyzer', {}).get('timezone', None))

        return table_df
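The start/end-time handling leans on dateparser, which returns None rather than raising when it can't make sense of the input; the .timestamp() call on that None is what trips the except clause above. A minimal illustration:

import dateparser

for text in ['2 hours ago', 'yesterday', 'not a time at all']:
    dt = dateparser.parse(text)
    epoch_ms = int(dt.timestamp() * 1000) if dt else None
    print(text, '->', epoch_ms)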