def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
    '''Humanize the status change timestamp field'''
    if df.empty:
        return df

    if 'statusChangeTimestamp' in df.columns:
        df['statusChangeTimestamp'] = humanize_timestamp(
            df.statusChangeTimestamp,
            self.cfg.get('analyzer', {}).get('timezone', None))

    return super().humanize_fields(df)
def summarize(self, **kwargs):
    """Describe the data"""

    # Discard these
    kwargs.pop('columns', None)

    # 'ospfIf' is ignored
    self._init_summarize(**kwargs)
    if self.summary_df.empty:
        return self.summary_df

    self._summarize_on_add_field = [
        ('deviceCnt', 'hostname', 'nunique'),
        ('peerCnt', 'hostname', 'count'),
    ]

    self._summarize_on_add_with_query = [
        ('stubbyPeerCnt', 'areaStub', 'areaStub'),
        ('passivePeerCnt', 'adjState == "passive"', 'ifname'),
        ('unnumberedPeerCnt', 'isUnnumbered', 'isUnnumbered'),
        ('failedPeerCnt', 'adjState != "passive" and nbrCount == 0',
         'ifname'),
    ]

    self._summarize_on_add_list_or_count = [
        ('area', 'area'),
        ('vrf', 'vrf'),
        ('helloTime', 'helloTime'),
        ('deadTime', 'deadTime'),
        ('retxTime', 'retxTime'),
        ('networkType', 'networkType'),
    ]

    self.summary_df['lastChangeTime'] = np.where(
        self.summary_df.lastChangeTime.isnull(), 0,
        self.summary_df.lastChangeTime)

    self.summary_df['lastChangeTime'] = humanize_timestamp(
        self.summary_df.lastChangeTime,
        self.cfg.get('analyzer', {}).get('timezone', None))

    self.summary_df['lastChangeTime'] = (
        self.summary_df['timestamp'] - self.summary_df['lastChangeTime'])
    self.summary_df['lastChangeTime'] = self.summary_df['lastChangeTime'] \
        .apply(lambda x: x.round('s'))

    self._summarize_on_add_stat = [
        ('adjChangesStat', '', 'numChanges'),
        ('upTimeStat', 'adjState == "full"', 'lastChangeTime'),
    ]

    self._gen_summarize_data()
    self._post_summarize()
    return self.ns_df.convert_dtypes()
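
# Note on the _summarize_on_* hooks used above (inferred from how this
# file uses them, not from separate documentation):
#   _summarize_on_add_field:         (summary_key, column, aggfunc)
#   _summarize_on_add_with_query:    (summary_key, pandas_query, column)
#                                    with an optional 4th aggfunc element
#   _summarize_on_add_list_or_count: (summary_key, column)
#   _summarize_on_add_stat:          (summary_key, pandas_query, column)
# _gen_summarize_data() walks these lists to compute one value per
# namespace, and _post_summarize() assembles the rows into self.ns_df.
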
def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
    '''Humanize the timestamp fields'''
    if df.empty:
        return df

    if 'lastUpdate' in df.columns:
        df['lastUpdate'] = humanize_timestamp(
            df.lastUpdate,
            self.cfg.get('analyzer', {}).get('timezone', None))

    return super().humanize_fields(df)
def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
    '''Humanize the fields for human consumption.

    Individual classes will implement the right transformations.
    This routine is just a placeholder for all those with nothing
    to modify.
    '''
    if 'timestamp' in df.columns and not df.empty:
        df['timestamp'] = humanize_timestamp(
            df.timestamp,
            self.cfg.get('analyzer', {}).get('timezone', None))

    return df
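
# For reference, a minimal sketch (an assumption, not the real
# implementation) of what a humanize_timestamp helper with this call
# signature could do: treat the raw values as epoch milliseconds and
# localize them to the configured timezone.
def _humanize_timestamp_sketch(series: pd.Series, tz=None) -> pd.Series:
    # Epoch msecs -> tz-aware datetimes; default to UTC when no tz given
    return pd.to_datetime(series, unit='ms', utc=True) \
             .dt.tz_convert(tz or 'UTC')
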
def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
    '''Humanize the timestamp and boot time fields'''
    if df.empty:
        return df

    # bootupTimestamp is in seconds; humanize_timestamp expects msecs
    if 'bootupTimestamp' in df.columns:
        df['bootupTimestamp'] = humanize_timestamp(
            df['bootupTimestamp']*1000,
            self.cfg.get('analyzer', {}).get('timezone', None))

    return super().humanize_fields(df)
def humanize_fields(self, df: pd.DataFrame, _=None) -> pd.DataFrame:
    '''Humanize the last change timestamp field'''
    if df.empty:
        return df

    if 'lastChangeTime' in df.columns:
        df['lastChangeTime'] = humanize_timestamp(
            df.lastChangeTime.fillna(0),
            self.cfg.get('analyzer', {}).get('timezone', None))
        if 'adjState' in df.columns:
            df['lastChangeTime'] = np.where(df.adjState == "passive",
                                            pd.Timestamp(0),
                                            df.lastChangeTime)

    return super().humanize_fields(df)
def get_file_timestamps(filelist: List[str]) -> pd.DataFrame:
    """Read the files and construct a dataframe of files and the
    timestamp of the records in them.

    :param filelist: list of full path name files, typically from
                     pyarrow's dataset.files
    :returns: dataframe of filename with the time it represents, sorted
    :rtype: pandas.DataFrame
    """
    if not filelist:
        return pd.DataFrame(columns=['file', 'timestamp'])

    # We can't rely on the system stat time to find the times involved,
    # so read the data for each block and check. We tried using threading
    # and it didn't dramatically alter the results. Given that we might
    # have too many threads running with the poller and everything, we
    # skipped doing it.
    fname_list = []
    fts_list = []
    for file in filelist:
        try:
            ts = pd.read_parquet(file, columns=['timestamp'])
            fts_list.append(ts.timestamp.min())
            fname_list.append(file)
        except OSError:
            # Skip this file because it can't be read, probably 0 bytes
            logging.debug(f"not reading timestamp for {file}")

    # Construct a file dataframe as it's simpler to deal with
    if fname_list:
        fdf = pd.DataFrame({'file': fname_list, 'timestamp': fts_list})
        fdf['timestamp'] = humanize_timestamp(fdf.timestamp, 'UTC')
        return fdf.sort_values(by=['timestamp'])

    return pd.DataFrame(columns=['file', 'timestamp'])
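
# Example usage (a sketch): feeding get_file_timestamps() the file list
# of a pyarrow dataset, as the docstring suggests. The coalesced path
# '/parquet/interfaces' is hypothetical.
#
#     import pyarrow.dataset as ds
#
#     dataset = ds.dataset('/parquet/interfaces', format='parquet',
#                          partitioning='hive')
#     fdf = get_file_timestamps(dataset.files)
#     print(fdf.head())
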
def _write_verify_transform(mod_df, table, dbeng, schema, config_file,
                            query_str_list, changed_fields):
    """Write and verify that the written data is present

    :param mod_df: pd.DataFrame, the modified dataframe to write
    :param table: str, the name of the table to write
    :param dbeng: SqParquetDB, pointer to DB class to write/read
    :param schema: SchemaForTable, Schema of data to be written
    :param config_file: str, Filename where suzieq config is stored
    :param query_str_list: List[str], query string if any to apply to
                           data for verification check
    :param changed_fields: set, list of changed fields to verify
    :returns: Nothing
    :rtype:
    """
    mod_df = mod_df.reset_index(drop=True)
    mod_df.timestamp = mod_df.timestamp.view(np.int64)
    mod_df.timestamp = mod_df.timestamp // 1000000
    mod_df.sqvers = mod_df.sqvers.astype(str)

    dbeng.write(table, 'pandas', mod_df, False,
                schema.get_arrow_schema(), None)

    # Verify that what we wrote is what we got back
    mod_df.sqvers = mod_df.sqvers.astype(float)
    tblobj = get_sqobject(table)
    post_read_df = tblobj(config_file=config_file).get(
        columns=schema.fields)

    assert not post_read_df.empty

    # If the data was built up as a series of queries, we have to
    # apply the queries to verify that we have what we wrote
    dfconcat = None
    if query_str_list:
        for qstr in query_str_list:
            qdf = post_read_df.query(qstr).reset_index(drop=True)
            assert not qdf.empty
            if dfconcat is not None:
                dfconcat = pd.concat([dfconcat, qdf])
            else:
                dfconcat = qdf

    if dfconcat is not None:
        qdf = dfconcat.set_index(schema.key_fields()) \
                      .sort_index()
    else:
        qdf = post_read_df.set_index(schema.key_fields()) \
                          .sort_index()

    mod_df = mod_df.set_index(schema.key_fields()) \
                   .query('~index.duplicated(keep="last")') \
                   .sort_index()

    mod_df.timestamp = humanize_timestamp(mod_df.timestamp, 'GMT')

    # We can't call assert_df_equal directly and so we
    # compare this way. The catch is if we accidentally
    # change some of the unchanged fields
    assert mod_df.shape == qdf.shape
    assert not [x for x in mod_df.columns.tolist()
                if x not in qdf.columns.tolist()]
    assert (mod_df.index == qdf.index).all()
    assert_df_equal(mod_df[changed_fields].reset_index(),
                    qdf[changed_fields].reset_index(), None)
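
# A small worked sketch of the epoch conversion done at the top of
# _write_verify_transform(): pandas stores timestamps as datetime64[ns],
# .view(np.int64) reinterprets them as epoch nanoseconds, and integer
# division by 1000000 yields the epoch milliseconds the writer expects.
# This helper is illustrative only; it is not used by the test above.
def _to_epoch_ms_sketch(ts: pd.Series) -> pd.Series:
    # datetime64[ns] -> int64 nsecs -> int64 msecs
    return ts.view(np.int64) // 1000000
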
def get(self, **kwargs):
    """Get the information requested"""
    view = kwargs.get('view', self.iobj.view)
    columns = kwargs.get('columns', ['default'])
    addnl_fields = kwargs.pop('addnl_fields', [])
    user_query = kwargs.pop('query_str', '')
    status = kwargs.pop('status', '')
    os_version = kwargs.pop('version', '')
    vendor = kwargs.get('vendor', '')
    model = kwargs.get('model', '')
    os = kwargs.get('os', '')

    drop_cols = []

    if 'active' not in addnl_fields+columns and columns != ['*']:
        addnl_fields.append('active')
        drop_cols.append('active')

    # os is not included in the default column list. Why? I was dumb
    if (columns == ['default'] and os) or (os and 'os' not in columns):
        addnl_fields.append('os')
        drop_cols.append('os')

    for col in ['namespace', 'hostname', 'status', 'address']:
        if columns not in [['default'], ['*']] and col not in columns:
            addnl_fields.append(col)
            drop_cols.append(col)

    if columns == ['*'] or 'uptime' in columns:
        if columns != ['*'] and 'bootupTimestamp' not in columns:
            addnl_fields.append('bootupTimestamp')
            drop_cols.append('bootupTimestamp')

    df = super().get(active_only=False, addnl_fields=addnl_fields,
                     **kwargs)

    if view == 'latest' and 'status' in df.columns:
        df['status'] = np.where(df.active, df['status'], 'dead')

    poller_df = self._get_table_sqobj('sqPoller').get(
        namespace=kwargs.get('namespace', []),
        hostname=kwargs.get('hostname', []),
        service='device',
        columns='namespace hostname status'.split())

    if not poller_df.empty:
        # Identify the address to namespace/hostname mapping
        addr_dict = {f"{x['namespace']}-{x['address']}": x['hostname']
                     for x in df[['namespace', 'address', 'hostname']]
                     .to_dict(orient='records')}

        poller_df['hostname'] = poller_df.apply(
            lambda x, y: y.get(f"{x['namespace']}-{x['hostname']}",
                               x['hostname']),
            args=(addr_dict,), axis=1)

        poller_df = poller_df \
            .drop_duplicates(subset=['namespace', 'hostname'],
                             keep='last') \
            .reset_index(drop=True)

        df = df.merge(poller_df, on=['namespace', 'hostname'],
                      how='outer', suffixes=['', '_y']) \
               .fillna({'bootupTimestamp': 0, 'timestamp': 0,
                        'active': True}) \
               .fillna('N/A')

        df.status = np.where(
            (df['status_y'] != 0) & (df['status_y'] != 200) &
            (df['status'] == "N/A"),
            'neverpoll', df['status'])
        df = df[df.status != 'N/A']
        df.timestamp = np.where(df['timestamp'] == 0,
                                df['timestamp_y'], df['timestamp'])
        if 'address' in df.columns:
            df.address = np.where(df['address'] == 'N/A',
                                  df['hostname'], df['address'])

        drop_cols.extend(['status_y', 'timestamp_y'])

        if 'uptime' in columns or columns == ['*']:
            uptime_cols = (df['timestamp'] -
                           humanize_timestamp(
                               df['bootupTimestamp']*1000,
                               self.cfg.get('analyzer', {})
                               .get('timezone', None)))
            uptime_cols = pd.to_timedelta(uptime_cols, unit='s')
            df.insert(len(df.columns)-1, 'uptime', uptime_cols)

    if df.empty:
        return df

    # The poller merge kills the filtering we did earlier, so redo:
    if status:
        df = df.loc[df.status.isin(status)]
    if vendor:
        df = df.loc[df.vendor.isin(vendor)]
    if model:
        df = df.loc[df.model.isin(model)]
    if os:
        df = df.loc[df.os.isin(os)]

    if os_version:
        # Two-char operators must precede their one-char prefixes so
        # that '>=' isn't matched as '>'
        opdict = {'>=': operator.ge, '<=': operator.le,
                  '!=': operator.ne, '>': operator.gt,
                  '<': operator.lt, '=': operator.eq}
        op = operator.eq
        for elem, val in opdict.items():
            if os_version.startswith(elem):
                os_version = os_version.replace(elem, '')
                op = val
                break

        df = df.loc[df.version.apply(
            lambda x: op(version.LegacyVersion(x),
                         version.LegacyVersion(os_version)))]

    df = self._handle_user_query_str(df, user_query)

    # If the poller has failed completely, it can mess up the order
    # of the columns
    cols = self.iobj.schema.get_display_fields(columns)
    if columns == ['default'] and 'timestamp' not in cols:
        cols.append('timestamp')
    if 'sqvers' in cols:
        cols.remove('sqvers')

    return df.drop(columns=drop_cols, errors='ignore')[cols]
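
# A standalone sketch of the operator-prefix parsing used for the
# version filter in get() above. It's a hypothetical helper, not part
# of the class; it returns the comparison operator plus the bare
# version string, checking two-character operators first.
def _parse_version_filter_sketch(expr: str):
    opdict = {'>=': operator.ge, '<=': operator.le, '!=': operator.ne,
              '>': operator.gt, '<': operator.lt, '=': operator.eq}
    for prefix, op in opdict.items():
        if expr.startswith(prefix):
            return op, expr[len(prefix):].strip()
    # No operator prefix: default to equality
    return operator.eq, expr.strip()

# e.g. _parse_version_filter_sketch('>=4.21') -> (operator.ge, '4.21')
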
def summarize(self, **kwargs) -> pd.DataFrame:
    """Summarize key information about BGP"""

    self._init_summarize(**kwargs)
    if self.summary_df.empty or ('error' in self.summary_df.columns):
        return self.summary_df

    self.summary_df['afiSafi'] = (self.summary_df['afi'] + ' ' +
                                  self.summary_df['safi'])

    afi_safi_count = self.summary_df.groupby(by=['namespace'])['afiSafi'] \
                                    .nunique()

    self.summary_df = self.summary_df \
        .set_index(['namespace', 'hostname', 'vrf', 'peer']) \
        .query('~index.duplicated(keep="last")') \
        .reset_index()
    self.ns = {i: {} for i in self.summary_df['namespace'].unique()}
    self.nsgrp = self.summary_df.groupby(by=["namespace"],
                                         observed=True)

    self._summarize_on_add_field = [
        ('deviceCnt', 'hostname', 'nunique'),
        ('totalPeerCnt', 'peer', 'count'),
        ('uniqueAsnCnt', 'asn', 'nunique'),
        ('uniqueVrfsCnt', 'vrf', 'nunique'),
    ]

    self._summarize_on_add_with_query = [
        ('failedPeerCnt', 'state == "NotEstd"', 'peer'),
        ('iBGPPeerCnt', 'asn == peerAsn', 'peer'),
        ('eBGPPeerCnt', 'asn != peerAsn', 'peer'),
        ('rrClientPeerCnt', 'rrclient.str.lower() == "true"', 'peer',
         'count'),
    ]

    self._gen_summarize_data()

    # pylint: disable=expression-not-assigned
    {self.ns[i].update({'activeAfiSafiCnt': afi_safi_count[i]})
     for i in self.ns.keys()}
    self.summary_row_order.append('activeAfiSafiCnt')

    self.summary_df['estdTime'] = humanize_timestamp(
        self.summary_df.estdTime,
        self.cfg.get('analyzer', {}).get('timezone', None))

    self.summary_df['estdTime'] = (self.summary_df['timestamp'] -
                                   self.summary_df['estdTime'])
    self.summary_df['estdTime'] = self.summary_df['estdTime'] \
        .apply(lambda x: x.round('s'))

    # Now come the BGP specific ones
    established = self.summary_df.query("state == 'Established'") \
                                 .groupby(by=['namespace'])

    uptime = established["estdTime"]
    rx_updates = established["updatesRx"]
    tx_updates = established["updatesTx"]
    self._add_stats_to_summary(uptime, 'upTimeStat')
    self._add_stats_to_summary(rx_updates, 'updatesRxStat')
    self._add_stats_to_summary(tx_updates, 'updatesTxStat')

    self.summary_row_order.extend(['upTimeStat', 'updatesRxStat',
                                   'updatesTxStat'])

    self._post_summarize()
    return self.ns_df.convert_dtypes()
def get_valid_df(self, table: str, **kwargs) -> pd.DataFrame:
    """The heart of the engine: retrieving the data from the backing store

    Args:
        table (str): Name of the table to retrieve the data for

    Returns:
        pd.DataFrame: The data as a pandas dataframe
    """
    if not self.ctxt.engine:
        print("Specify an analysis engine using set engine command")
        return pd.DataFrame(columns=["namespace", "hostname"])

    # Thanks to things like OSPF, we cannot use self.schema here
    sch = SchemaForTable(table, self.all_schemas)
    phy_table = sch.get_phy_table_for_table()

    columns = kwargs.pop('columns', ['default'])
    addnl_fields = kwargs.pop('addnl_fields', [])
    view = kwargs.pop('view', self.iobj.view)
    active_only = kwargs.pop('active_only', True)
    hostname = kwargs.pop('hostname', [])

    fields = sch.get_display_fields(columns)
    key_fields = sch.key_fields()
    drop_cols = []

    if columns == ['*']:
        drop_cols.append('sqvers')

    aug_fields = sch.get_augmented_fields()

    if 'timestamp' not in fields:
        fields.append('timestamp')

    if 'active' not in fields + addnl_fields:
        addnl_fields.append('active')
        if view != 'all':
            drop_cols.append('active')

    # Order matters. Don't put this before the missing key fields insert
    for f in aug_fields:
        dep_fields = sch.get_parent_fields(f)
        addnl_fields += dep_fields

    for fld in key_fields:
        if fld not in fields + addnl_fields:
            addnl_fields.insert(0, fld)
            drop_cols.append(fld)

    for f in addnl_fields:
        if f not in fields:
            # timestamp is always the last field
            fields.insert(-1, f)

    if self.iobj.start_time:
        try:
            start_time = int(dateparser.parse(
                self.iobj.start_time.replace('last night', 'yesterday'))
                .timestamp() * 1000)
        except Exception:
            # pylint: disable=raise-missing-from
            raise ValueError(
                f"unable to parse start-time: {self.iobj.start_time}")
    else:
        start_time = ''

    if self.iobj.start_time and not start_time:
        # Something went wrong with our parsing
        # pylint: disable=raise-missing-from
        raise ValueError(
            f"unable to parse start-time: {self.iobj.start_time}")

    if self.iobj.end_time:
        try:
            end_time = int(dateparser.parse(
                self.iobj.end_time.replace('last night', 'yesterday'))
                .timestamp() * 1000)
        except Exception:
            # pylint: disable=raise-missing-from
            raise ValueError(
                f"unable to parse end-time: {self.iobj.end_time}")
    else:
        end_time = ''

    if self.iobj.end_time and not end_time:
        # Something went wrong with our parsing
        # pylint: disable=raise-missing-from
        raise ValueError(f"unable to parse end-time: {self.iobj.end_time}")

    table_df = self._dbeng.read(
        phy_table,
        'pandas',
        start_time=start_time,
        end_time=end_time,
        columns=fields,
        view=view,
        key_fields=key_fields,
        **kwargs)

    if not table_df.empty:
        # hostname may not have been filtered if using regex
        if hostname:
            hdf_list = []
            for hn in hostname:
                if hn.startswith('~'):
                    hn = hn[1:]
                df1 = table_df.query(f"hostname.str.match('{hn}')")
                if not df1.empty:
                    hdf_list.append(df1)

            if hdf_list:
                table_df = pd.concat(hdf_list)
            else:
                return pd.DataFrame(columns=table_df.columns.tolist())

        if view == "all" or not active_only:
            table_df.drop(columns=drop_cols, inplace=True)
        else:
            table_df = table_df.query('active') \
                               .drop(columns=drop_cols)
        if 'timestamp' in table_df.columns and not table_df.empty:
            table_df['timestamp'] = humanize_timestamp(
                table_df.timestamp,
                self.cfg.get('analyzer', {}).get('timezone', None))

    return table_df
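
# A quick illustration (assuming the dateparser package used above) of
# how a natural-language start-time/end-time string becomes the
# epoch-millisecond bound passed to the DB read:
#
#     import dateparser
#
#     dt = dateparser.parse('2 hours ago')      # -> datetime.datetime
#     start_time = int(dt.timestamp() * 1000)   # epoch msecs, as above
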