def get_table_info(self, table, **kwargs):
    sch = SchemaForTable(table, schema=self.schemas)
    key_fields = sch.key_fields()
    # You can't use view from user because we need to see all the data
    # to compute data required.
    kwargs.pop('view', None)
    all_time_df = self._get_table_info(table, view='all', **kwargs)
    times = all_time_df['timestamp'].unique()
    ret = {'first_time': all_time_df.timestamp.min(),
           'latest_time': all_time_df.timestamp.max(),
           'intervals': len(times),
           'all rows': len(all_time_df),
           'namespaces': self._unique_or_zero(all_time_df, 'namespace'),
           'devices': self._unique_or_zero(all_time_df, 'hostname')}

    return ret
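# A minimal standalone sketch of the summary pattern above, using plain
# pandas on a made-up frame; `demo_df` and its contents are assumptions
# for illustration, not part of the original code.
import pandas as pd

demo_df = pd.DataFrame({
    'timestamp': [1, 1, 2, 3],
    'namespace': ['dc1', 'dc1', 'dc2', 'dc2'],
    'hostname': ['leaf1', 'leaf2', 'leaf1', 'leaf2'],
})
summary = {
    'first_time': demo_df.timestamp.min(),      # earliest snapshot seen
    'latest_time': demo_df.timestamp.max(),     # most recent snapshot
    'intervals': demo_df.timestamp.nunique(),   # distinct capture times
    'all rows': len(demo_df),
    'namespaces': demo_df.namespace.nunique(),
    'devices': demo_df.hostname.nunique(),
}
print(summary)
# {'first_time': 1, 'latest_time': 3, 'intervals': 3, 'all rows': 4,
#  'namespaces': 2, 'devices': 2}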
class SqObject(object):
    def __init__(self, engine_name: str = 'pandas',
                 hostname: typing.List[str] = [],
                 start_time: str = '', end_time: str = '',
                 view: str = 'latest', namespace: typing.List[str] = [],
                 columns: typing.List[str] = ['default'],
                 context=None, table: str = '', config_file=None) -> None:
        if context is None:
            self.ctxt = SqContext(engine_name, config_file)
        else:
            self.ctxt = context
            if not self.ctxt:
                self.ctxt = SqContext(engine_name)

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()

        if not namespace and self.ctxt.namespace:
            self.namespace = self.ctxt.namespace
        else:
            self.namespace = namespace
        if not hostname and self.ctxt.hostname:
            self.hostname = self.ctxt.hostname
        else:
            self.hostname = hostname

        if not start_time and self.ctxt.start_time:
            self.start_time = self.ctxt.start_time
        else:
            self.start_time = start_time

        if not end_time and self.ctxt.end_time:
            self.end_time = self.ctxt.end_time
        else:
            self.end_time = end_time

        if not view and self.ctxt.view:
            self.view = self.ctxt.view
        else:
            self.view = view

        self.columns = columns

        if engine_name and engine_name != '':
            self.engine = get_sqengine(engine_name,
                                       self._table)(self._table, self)
        elif self.ctxt.engine:
            self.engine = get_sqengine(self.ctxt.engine,
                                       self._table)(self._table, self)

        if not self.engine:
            raise ValueError('Unknown analysis engine')

        self._addnl_filter = None
        self._addnl_fields = []
        self._valid_get_args = None
        self._valid_assert_args = None
        self._valid_arg_vals = None

    @property
    def all_schemas(self):
        return self.ctxt.schemas

    @property
    def schema(self):
        return self._schema

    @property
    def cfg(self):
        return self._cfg

    @property
    def table(self):
        return self._table

    def _check_input_for_valid_args(self, good_arg_list, **kwargs):
        if not good_arg_list:
            return

        # add standard args that are always valid
        good_arg_list = good_arg_list + ['namespace', 'addnl_fields']
        for arg in kwargs.keys():
            if arg not in good_arg_list:
                raise AttributeError(
                    f"argument {arg} not supported for this command")

    def _check_input_for_valid_vals(self, good_arg_val_list, **kwargs):
        '''Check if the input is valid for the arg, if possible'''
        if not good_arg_val_list:
            return

        for arg in kwargs.keys():
            if arg in good_arg_val_list:
                if kwargs[arg] not in good_arg_val_list[arg]:
                    raise AttributeError(
                        f"invalid value {kwargs[arg]} for argument {arg}")

    def validate_get_input(self, **kwargs):
        self._check_input_for_valid_args(
            self._valid_get_args + ['columns'], **kwargs)
        self._check_input_for_valid_vals(self._valid_arg_vals, **kwargs)

    def validate_assert_input(self, **kwargs):
        self._check_input_for_valid_args(self._valid_assert_args, **kwargs)

    def get(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        if self._addnl_filter:
            kwargs['add_filter'] = self._addnl_filter

        # This raises exceptions if it fails
        try:
            self.validate_get_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        return self.engine.get(**kwargs)

    def summarize(self, namespace=[], hostname=[],
                  query_str='') -> pd.DataFrame:
        if self.columns != ["default"]:
            self.summarize_df = pd.DataFrame(
                {'error':
                 ['ERROR: You cannot specify columns with summarize']})
            return self.summarize_df

        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine.summarize(namespace=namespace,
                                     hostname=hostname,
                                     query_str=query_str)

    def unique(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        columns = kwargs.pop('columns', self.columns)
        return self.engine.unique(**kwargs, columns=columns)

    def analyze(self, **kwargs):
        raise NotImplementedError

    def aver(self, **kwargs):
        raise NotImplementedError

    def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""
        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        # if self._valid_get_args:
        #     self._valid_get_args += ['what', 'n', 'reverse']

        # This raises exceptions if it fails
        try:
            self.validate_get_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        table_schema = SchemaForTable(self._table, self.all_schemas)
        columns = table_schema.get_display_fields(columns)

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine.top(what=what, n=n, reverse=reverse, **kwargs)

    def humanize_fields(self, df: pd.DataFrame, subset=None) -> pd.DataFrame:
        '''Humanize the fields for human consumption.

        Individual classes will implement the right transformations. This
        routine is just a placeholder for all those with nothing to modify.
        '''
        return df
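# A hedged usage sketch for the class above. `DeviceObj` and the 'device'
# table name are illustrative assumptions; the real subclasses live
# elsewhere in the repo and rely on this module's imports.
class DeviceObj(SqObject):
    def __init__(self, **kwargs):
        super().__init__(table='device', **kwargs)
        # args accepted by get(); 'namespace' and 'addnl_fields' are
        # always appended by _check_input_for_valid_args()
        self._valid_get_args = ['hostname', 'query_str']

# dev = DeviceObj(namespace=['dc1'])
# dev.get(hostname=['leaf01'])   # validated, then dispatched to the engine
# dev.get(bogus_arg=1)           # returns a one-row 'error' DataFrame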
def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
    if not self.ctxt.engine:
        print("Specify an analysis engine using set engine command")
        return pd.DataFrame(columns=["namespace", "hostname"])

    sch = SchemaForTable(table, schema=self.schemas)
    phy_table = sch.get_phy_table_for_table()

    columns = kwargs.pop('columns', ['default'])
    addnl_fields = kwargs.pop('addnl_fields', [])
    view = kwargs.pop('view', self.iobj.view)
    active_only = kwargs.pop('active_only', True)

    fields = sch.get_display_fields(columns)
    key_fields = sch.key_fields()
    drop_cols = []

    if columns == ['*']:
        drop_cols.append('sqvers')

    aug_fields = sch.get_augmented_fields()

    if 'timestamp' not in fields:
        fields.append('timestamp')

    if 'active' not in fields + addnl_fields:
        addnl_fields.append('active')
        drop_cols.append('active')

    # Order matters. Don't put this before the missing key fields insert
    for f in aug_fields:
        dep_fields = sch.get_parent_fields(f)
        addnl_fields += dep_fields

    for fld in key_fields:
        if fld not in fields + addnl_fields:
            addnl_fields.insert(0, fld)
            drop_cols.append(fld)

    for f in addnl_fields:
        if f not in fields:
            # timestamp is always the last field
            fields.insert(-1, f)

    if self.iobj.start_time:
        try:
            start_time = dateparser.parse(
                self.iobj.start_time.replace('last night', 'yesterday')) \
                .timestamp()*1000
        except Exception as e:
            print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
            return pd.DataFrame()
    else:
        start_time = ''

    if self.iobj.start_time and not start_time:
        # Something went wrong with our parsing
        print(f"ERROR: unable to parse {self.iobj.start_time}")
        return pd.DataFrame()

    if self.iobj.end_time:
        try:
            end_time = dateparser.parse(
                self.iobj.end_time.replace('last night', 'yesterday')) \
                .timestamp()*1000
        except Exception as e:
            print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
            return pd.DataFrame()
    else:
        end_time = ''

    if self.iobj.end_time and not end_time:
        # Something went wrong with our parsing
        print(f"ERROR: Unable to parse {self.iobj.end_time}")
        return pd.DataFrame()

    table_df = self._dbeng.read(
        phy_table,
        'pandas',
        start_time=start_time,
        end_time=end_time,
        columns=fields,
        view=view,
        key_fields=key_fields,
        **kwargs)

    if not table_df.empty:
        if view == "all" or not active_only:
            table_df.drop(columns=drop_cols, inplace=True)
        else:
            table_df = table_df.query('active') \
                               .drop(columns=drop_cols)
        if 'timestamp' in table_df.columns and not table_df.empty:
            table_df['timestamp'] = humanize_timestamp(
                table_df.timestamp,
                self.cfg.get('analyzer', {}).get('timezone', None))

    return table_df
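# A minimal sketch of the natural-language time handling above, assuming
# the third-party `dateparser` package is installed. The database layer
# expects epoch milliseconds.
import dateparser

def to_epoch_ms(timestr: str) -> float:
    # dateparser accepts phrases such as '10 minutes ago' or 'yesterday';
    # the code above first rewrites 'last night' to 'yesterday'
    parsed = dateparser.parse(timestr.replace('last night', 'yesterday'))
    if not parsed:
        # dateparser returns None rather than raising on unparseable input
        raise ValueError(f"unable to parse {timestr}")
    return parsed.timestamp() * 1000

# to_epoch_ms('2 hours ago')  -> e.g. 1618243200000.0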
def _get_combined_df(self, **kwargs):
    """OSPF has info divided across multiple tables. Get a single one"""

    columns = kwargs.pop('columns', ['default'])
    state = kwargs.pop('state', '')
    addnl_fields = kwargs.pop('addnl_fields', self.iobj._addnl_fields)
    addnl_nbr_fields = self.iobj._addnl_nbr_fields

    cols = SchemaForTable('ospf', schema=self.schemas) \
        .get_display_fields(columns)
    if columns == ['default']:
        cols.append('timestamp')

    ifschema = SchemaForTable('ospfIf', schema=self.schemas)
    nbrschema = SchemaForTable('ospfNbr', schema=self.schemas)

    if (columns != ['default']) and (columns != ['*']):
        ifkeys = ifschema.key_fields()
        nbrkeys = nbrschema.key_fields()
        if_flds = ifschema.fields
        nbr_flds = nbrschema.fields

        ifcols = ifkeys
        nbrcols = nbrkeys
        for fld in columns:
            if fld in if_flds and fld not in ifcols:
                ifcols.append(fld)
            elif fld in nbr_flds and fld not in nbrcols:
                nbrcols.append(fld)
    else:
        ifcols = ifschema.get_display_fields(columns)
        nbrcols = nbrschema.get_display_fields(columns)

    if state == "full":
        query_str = 'adjState == "full" or adjState == "passive"'
    elif state == "other":
        query_str = 'adjState != "full" and adjState != "passive"'
    elif state == "passive":
        query_str = 'adjState == "passive"'
    else:
        query_str = ''

    df = self.get_valid_df('ospfIf', addnl_fields=addnl_fields,
                           columns=ifcols, **kwargs)
    nbr_df = self.get_valid_df('ospfNbr', addnl_fields=addnl_nbr_fields,
                               columns=nbrcols, **kwargs)
    if nbr_df.empty:
        return nbr_df

    merge_cols = [x for x in ['namespace', 'hostname', 'ifname']
                  if x in nbr_df.columns]
    # Merge the two tables
    df = df.merge(nbr_df, on=merge_cols, how='left')

    if columns == ['*']:
        df = df.drop(columns=['area_y', 'instance_y', 'vrf_y',
                              'areaStub_y', 'timestamp_y']) \
            .rename(columns={
                'instance_x': 'instance', 'areaStub_x': 'areaStub',
                'area_x': 'area', 'vrf_x': 'vrf',
                'state_x': 'ifState', 'state_y': 'adjState',
                'sqvers_x': 'sqvers', 'active_x': 'active',
                'timestamp_x': 'timestamp'})
    else:
        df = df.rename(columns={'vrf_x': 'vrf', 'area_x': 'area',
                                'state_x': 'ifState',
                                'state_y': 'adjState',
                                'timestamp_x': 'timestamp'})
        df = df.drop(list(df.filter(regex='_y$')), axis=1) \
               .fillna({'peerIP': '-', 'numChanges': 0,
                        'lastChangeTime': 0})

    # Fill the adjState column with passive if passive
    if 'passive' in df.columns:
        df.loc[df['adjState'].isnull(), 'adjState'] = df['passive']
        df.loc[df['adjState'].eq(True), 'adjState'] = 'passive'
        df.loc[df['adjState'].eq(False), 'adjState'] = 'fail'
        df.drop(columns=['passive'], inplace=True)

    df.bfill(axis=0, inplace=True)

    # Move the timestamp column to the end
    if query_str:
        return df.query(query_str)[cols]

    return df[cols]
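# A standalone sketch of the merge-and-rename pattern above: two frames
# sharing key columns are joined, and the pandas `_x`/`_y` suffix
# collisions are resolved explicitly. All data here is fabricated.
import pandas as pd

if_df = pd.DataFrame({'hostname': ['r1'], 'ifname': ['eth0'],
                      'state': ['up'], 'area': ['0.0.0.0']})
nbr_df = pd.DataFrame({'hostname': ['r1'], 'ifname': ['eth0'],
                       'state': ['full'], 'area': ['0.0.0.0']})

merged = if_df.merge(nbr_df, on=['hostname', 'ifname'], how='left') \
              .rename(columns={'state_x': 'ifState', 'state_y': 'adjState',
                               'area_x': 'area'})
# Drop any remaining right-side duplicates, as the code above does
merged = merged.drop(list(merged.filter(regex='_y$')), axis=1)
print(merged)   # one row with ifState='up', adjState='full'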
def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
    if not self.ctxt.engine:
        print("Specify an analysis engine using set engine command")
        return pd.DataFrame(columns=["namespace", "hostname"])

    sch = SchemaForTable(table, schema=self.schemas)
    phy_table = sch.get_phy_table_for_table()

    columns = kwargs.pop('columns', ['default'])
    addnl_fields = kwargs.pop('addnl_fields', [])
    view = kwargs.pop('view', self.iobj.view)
    active_only = kwargs.pop('active_only', True)

    fields = sch.get_display_fields(columns)
    key_fields = sch.key_fields()
    drop_cols = []

    if 'timestamp' not in fields:
        fields.append('timestamp')

    if 'active' not in fields + addnl_fields:
        addnl_fields.append('active')
        drop_cols.append('active')

    for fld in key_fields:
        if fld not in fields + addnl_fields:
            addnl_fields.insert(0, fld)
            drop_cols.append(fld)

    for f in addnl_fields:
        if f not in fields:
            # timestamp is always the last field
            fields.insert(-1, f)

    for dt in [self.iobj.start_time, self.iobj.end_time]:
        if dt:
            try:
                parse(dt)
            except (ValueError, ParserError) as e:
                print(f"invalid time {dt}: {e}")
                return pd.DataFrame()

    table_df = self.ctxt.engine.get_table_df(
        self.cfg,
        table=phy_table,
        start_time=self.iobj.start_time,
        end_time=self.iobj.end_time,
        columns=fields,
        view=view,
        key_fields=key_fields,
        **kwargs)

    if not table_df.empty:
        if view == 'latest' and active_only:
            table_df = table_df.query('active') \
                               .drop(columns=drop_cols)
        else:
            table_df.drop(columns=drop_cols, inplace=True)
        if 'timestamp' in table_df.columns:
            table_df['timestamp'] = pd.to_datetime(
                table_df.timestamp.astype(str), unit="ms")

    return table_df
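# A small sketch of the timestamp conversion used above: the database
# stores epoch milliseconds, and pandas turns them into datetimes.
import pandas as pd

ts = pd.Series([1618243200000, 1618246800000])
print(pd.to_datetime(ts, unit='ms'))
# 0   2021-04-12 16:00:00
# 1   2021-04-12 17:00:00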
def _get_combined_df(self, **kwargs):
    """OSPF has info divided across multiple tables. Get a single one"""

    columns = kwargs.pop('columns', ['default'])
    state = kwargs.pop('state', '')
    addnl_fields = kwargs.pop('addnl_fields', self.iobj._addnl_fields)
    addnl_nbr_fields = self.iobj._addnl_nbr_fields
    user_query = kwargs.pop('query_str', '')

    cols = SchemaForTable('ospf', schema=self.schemas) \
        .get_display_fields(columns)
    if columns == ['default']:
        cols.append('timestamp')

    ifschema = SchemaForTable('ospfIf', schema=self.schemas)
    nbrschema = SchemaForTable('ospfNbr', schema=self.schemas)

    if (columns != ['default']) and (columns != ['*']):
        ifkeys = ifschema.key_fields()
        nbrkeys = nbrschema.key_fields()
        if_flds = ifschema.fields
        nbr_flds = nbrschema.fields

        ifcols = ifkeys
        nbrcols = nbrkeys
        for fld in columns:
            if fld in if_flds and fld not in ifcols:
                ifcols.append(fld)
            elif fld in nbr_flds and fld not in nbrcols:
                nbrcols.append(fld)
    else:
        ifcols = ifschema.get_display_fields(columns)
        nbrcols = nbrschema.get_display_fields(columns)

    if state == "full":
        query_str = 'adjState == "full" or adjState == "passive"'
    elif state == "other":
        query_str = 'adjState != "full" and adjState != "passive"'
    elif state == "passive":
        query_str = 'adjState == "passive"'
    else:
        query_str = ''

    df = self.get_valid_df('ospfIf', addnl_fields=addnl_fields,
                           columns=ifcols, **kwargs)
    nbr_df = self.get_valid_df('ospfNbr', addnl_fields=addnl_nbr_fields,
                               columns=nbrcols, **kwargs)
    if nbr_df.empty:
        return df

    merge_cols = [x for x in ['namespace', 'hostname', 'ifname']
                  if x in nbr_df.columns]
    # Merge the two tables
    df = df.merge(nbr_df, on=merge_cols, how='left')

    # This is because some NOS have the ipAddress in nbr table and some in
    # interface table. Nbr table wins over interface table if present
    if 'ipAddress_y' in df:
        df['ipAddress'] = np.where(df['ipAddress_y'] == "",
                                   df['ipAddress_x'], df['ipAddress_y'])
        df['ipAddress'] = np.where(df['ipAddress'], df['ipAddress'],
                                   df['ipAddress_x'])

    if columns == ['*']:
        df = df.drop(columns=['area_y', 'instance_y', 'vrf_y',
                              'ipAddress_x', 'ipAddress_y',
                              'areaStub_y', 'timestamp_y'],
                     errors='ignore') \
            .rename(columns={
                'instance_x': 'instance', 'areaStub_x': 'areaStub',
                'area_x': 'area', 'vrf_x': 'vrf',
                'state_x': 'ifState', 'state_y': 'adjState',
                'sqvers_x': 'sqvers', 'active_x': 'active',
                'timestamp_x': 'timestamp'})
    else:
        df = df.rename(columns={'vrf_x': 'vrf', 'area_x': 'area',
                                'state_x': 'ifState',
                                'state_y': 'adjState',
                                'timestamp_x': 'timestamp'})
        df = df.drop(list(df.filter(regex='_y$')), axis=1) \
               .drop('ipAddress_x', axis=1, errors='ignore') \
               .fillna({'peerIP': '-', 'numChanges': 0,
                        'lastChangeTime': 0})

    # Fill the adjState column with passive if passive
    if 'passive' in df.columns:
        df.loc[df['adjState'].isnull(), 'adjState'] = df['passive']
        df.loc[df['adjState'].eq(True), 'adjState'] = 'passive'
        df.loc[df['adjState'].eq(False), 'adjState'] = 'fail'
        df.drop(columns=['passive'], inplace=True)

    df.bfill(axis=0, inplace=True)

    if 'peerHostname' in columns or (columns in [['*'], ['default']]):
        nfdf = df.query('adjState != "full"').reset_index()
        nfdf['peerHostname'] = ''
        newdf = df.query('adjState == "full"').reset_index() \
                  .drop('peerHostname', axis=1, errors='ignore')
        if not newdf.empty:
            newdf['matchIP'] = newdf.ipAddress.str.split('/').str[0]

            newdf = newdf.merge(newdf[['namespace', 'hostname', 'vrf',
                                       'matchIP']],
                                left_on=['namespace', 'vrf', 'peerIP'],
                                right_on=['namespace', 'vrf', 'matchIP'],
                                suffixes=["", "_y"]) \
                .rename(columns={'hostname_y': 'peerHostname'}) \
                .drop_duplicates(subset=['namespace', 'hostname',
                                         'vrf', 'ifname']) \
                .drop(columns=['matchIP', 'matchIP_y'], errors='ignore')

            if newdf.empty:
                newdf = df.query('adjState == "full"').reset_index()
                newdf['peerHostname'] = ''

            final_df = pd.concat([nfdf, newdf])
        else:
            final_df = df
    else:
        final_df = df

    if query_str:
        final_df = final_df.query(query_str).reset_index(drop=True)

    if user_query and not final_df.empty:
        final_df = self._handle_user_query_str(final_df, user_query)

    # Move the timestamp column to the end
    return final_df[cols]
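# A standalone sketch of the peer-hostname resolution above: each row's
# peerIP is matched against every device's own interface address via a
# self-merge. Data here is fabricated for illustration.
import pandas as pd

adj = pd.DataFrame({
    'namespace': ['dc1', 'dc1'],
    'hostname': ['r1', 'r2'],
    'vrf': ['default', 'default'],
    'ifname': ['eth0', 'eth0'],
    'ipAddress': ['10.0.0.1/30', '10.0.0.2/30'],
    'peerIP': ['10.0.0.2', '10.0.0.1'],
})
adj['matchIP'] = adj.ipAddress.str.split('/').str[0]
resolved = adj.merge(adj[['namespace', 'vrf', 'hostname', 'matchIP']],
                     left_on=['namespace', 'vrf', 'peerIP'],
                     right_on=['namespace', 'vrf', 'matchIP'],
                     suffixes=['', '_y']) \
              .rename(columns={'hostname_y': 'peerHostname'}) \
              .drop(columns=['matchIP', 'matchIP_y'])
print(resolved[['hostname', 'peerIP', 'peerHostname']])
#   hostname    peerIP peerHostname
# 0       r1  10.0.0.2           r2
# 1       r2  10.0.0.1           r1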
class SqObject(object):
    def __init__(self, engine_name: str = '',
                 hostname: typing.List[str] = [],
                 start_time: str = '', end_time: str = '',
                 view: str = 'latest', namespace: typing.List[str] = [],
                 columns: typing.List[str] = ['default'],
                 context=None, table: str = '', config_file=None) -> None:
        if context is None:
            self.ctxt = SqContext(engine_name, config_file)
        else:
            self.ctxt = context
            if not self.ctxt:
                self.ctxt = SqContext(engine_name)

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()

        if not namespace and self.ctxt.namespace:
            self.namespace = self.ctxt.namespace
        else:
            self.namespace = namespace
        if not hostname and self.ctxt.hostname:
            self.hostname = self.ctxt.hostname
        else:
            self.hostname = hostname

        if not start_time and self.ctxt.start_time:
            self.start_time = self.ctxt.start_time
        else:
            self.start_time = start_time

        if not end_time and self.ctxt.end_time:
            self.end_time = self.ctxt.end_time
        else:
            self.end_time = end_time

        if not view and self.ctxt.view:
            self.view = self.ctxt.view
        else:
            self.view = view

        self.columns = columns

        if engine_name and engine_name != '':
            self.engine = get_sqengine(engine_name)
        else:
            self.engine = self.ctxt.engine

        if self._table:
            self.engine_obj = self.engine.get_object(self._table, self)
        else:
            self.engine_obj = None

        self._addnl_filter = None
        self._addnl_fields = []

    @property
    def all_schemas(self):
        return self.ctxt.schemas

    @property
    def schema(self):
        return self._schema

    @property
    def cfg(self):
        return self._cfg

    @property
    def table(self):
        return self._table

    def validate_input(self, **kwargs):
        """Dummy validate input"""
        return

    def get(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        if self._addnl_filter:
            kwargs['add_filter'] = self._addnl_filter

        # This raises exceptions if it fails
        try:
            self.validate_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        return self.engine_obj.get(**kwargs)

    def summarize(self, namespace='') -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine_obj.summarize(namespace=namespace)

    def unique(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine_obj.unique(**kwargs)

    def analyze(self, **kwargs):
        raise NotImplementedError

    def aver(self, **kwargs):
        raise NotImplementedError

    def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""
        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        table_schema = SchemaForTable(self._table, self.all_schemas)
        columns = table_schema.get_display_fields(columns)

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine_obj.top(what=what, n=n, reverse=reverse, **kwargs)
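# A hedged sketch of what an engine-side top() might do with pandas. The
# real implementation lives in the engine module; this is illustrative
# only, and `flows_df`/`numChanges` are made-up names.
import pandas as pd

def top(df: pd.DataFrame, what: str, n: int = 5,
        reverse: bool = False) -> pd.DataFrame:
    # nsmallest for "bottom" entries, nlargest for "top"
    return df.nsmallest(n, what) if reverse else df.nlargest(n, what)

# top(flows_df, what='numChanges', n=5)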
def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
    if not self.ctxt.engine:
        print("Specify an analysis engine using set engine command")
        return pd.DataFrame(columns=["namespace", "hostname"])

    sch = SchemaForTable(table, schema=self.schemas)
    phy_table = sch.get_phy_table_for_table()

    columns = kwargs.pop('columns', ['default'])
    addnl_fields = kwargs.pop('addnl_fields', [])
    view = kwargs.pop('view', self.iobj.view)
    active_only = kwargs.pop('active_only', True)
    query_str = kwargs.pop('query_str', '')

    # The REST API provides the query_str enclosed in ". Strip that
    if query_str:
        if query_str.startswith('"') and query_str.endswith('"'):
            query_str = query_str[1:-1]

    fields = sch.get_display_fields(columns)
    key_fields = sch.key_fields()
    drop_cols = []

    if columns == ['*']:
        drop_cols.append('sqvers')

    if 'timestamp' not in fields:
        fields.append('timestamp')

    if 'active' not in fields + addnl_fields:
        addnl_fields.append('active')
        drop_cols.append('active')

    for fld in key_fields:
        if fld not in fields + addnl_fields:
            addnl_fields.insert(0, fld)
            drop_cols.append(fld)

    for f in addnl_fields:
        if f not in fields:
            # timestamp is always the last field
            fields.insert(-1, f)

    if self.iobj.start_time:
        try:
            start_time = dateparser.parse(
                self.iobj.start_time.replace('last night', 'yesterday')) \
                .timestamp()*1000
        except Exception as e:
            print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
            return pd.DataFrame()
    else:
        start_time = ''

    if self.iobj.start_time and not start_time:
        # Something went wrong with our parsing
        print(f"ERROR: unable to parse {self.iobj.start_time}")
        return pd.DataFrame()

    if self.iobj.end_time:
        try:
            end_time = dateparser.parse(
                self.iobj.end_time.replace('last night', 'yesterday')) \
                .timestamp()*1000
        except Exception as e:
            print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
            return pd.DataFrame()
    else:
        end_time = ''

    if self.iobj.end_time and not end_time:
        # Something went wrong with our parsing
        print(f"ERROR: Unable to parse {self.iobj.end_time}")
        return pd.DataFrame()

    table_df = self._dbeng.read(
        phy_table,
        'pandas',
        start_time=start_time,
        end_time=end_time,
        columns=fields,
        view=view,
        key_fields=key_fields,
        **kwargs
    )

    if not table_df.empty:
        if view == 'latest' and active_only:
            table_df = table_df.query('active') \
                               .drop(columns=drop_cols)
        else:
            table_df.drop(columns=drop_cols, inplace=True)
        if 'timestamp' in table_df.columns:
            table_df['timestamp'] = humanize_timestamp(
                table_df.timestamp,
                self.cfg.get('analyzer', {}).get('timezone', None))

    if query_str:
        return table_df.query(query_str)

    return table_df
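# A small sketch of the REST query-string handling above: surrounding
# double quotes are stripped before the string is handed to df.query().
# The frame contents are fabricated.
import pandas as pd

def apply_user_query(df: pd.DataFrame, query_str: str) -> pd.DataFrame:
    if query_str.startswith('"') and query_str.endswith('"'):
        query_str = query_str[1:-1]
    return df.query(query_str) if query_str else df

df = pd.DataFrame({'hostname': ['leaf1', 'spine1'], 'mtu': [1500, 9216]})
print(apply_user_query(df, '"mtu > 1500"'))   # returns only the spine1 row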
def get_table_df(self, cfg, schemas, **kwargs) -> pd.DataFrame:
    """Use Pandas instead of Spark to retrieve the data"""

    MAX_FILECNT_TO_READ_FOLDER = 10000

    self.cfg = cfg

    table = kwargs.pop("table")
    start = kwargs.pop("start_time")
    end = kwargs.pop("end_time")
    view = kwargs.pop("view")
    sort_fields = kwargs.pop("sort_fields")
    ign_key_fields = kwargs.pop("ign_key", [])
    addnl_fields = kwargs.pop("addnl_fields", [])

    for f in ['active', 'timestamp']:
        if f not in addnl_fields:
            addnl_fields.append(f)

    sch = SchemaForTable(table, schema=schemas)
    phy_table = sch.get_phy_table_for_table()

    folder = self._get_table_directory(phy_table)

    # Restrict to a single namespace if that's what's asked
    if "namespace" in kwargs:
        v = kwargs["namespace"]
        if v:
            if not isinstance(v, list):
                folder += "/namespace={}/".format(v)

    fcnt = self.get_filecnt(folder)
    if fcnt == 0:
        return pd.DataFrame()

    # We hard code use_get_files to False until we have some automated
    # testing
    use_get_files = False
    # use_get_files = (
    #     (fcnt > MAX_FILECNT_TO_READ_FOLDER and view == "latest") or
    #     start or end
    # )

    if use_get_files:
        # Switch to the more efficient method when there are lots of
        # files; reduce I/O since that is the worst drag
        key_fields = []
        if len(kwargs.get("namespace", [])) > 1:
            del kwargs["namespace"]
        files = get_latest_files(folder, start, end, view)
    else:
        # ign_key_fields contains key fields that are not partition cols
        key_fields = [i for i in sch.key_fields()
                      if i not in ign_key_fields]
        filters = self.build_pa_filters(start, end, key_fields, **kwargs)

    if "columns" in kwargs:
        columns = kwargs["columns"]
        del kwargs["columns"]
    else:
        columns = ["default"]

    fields = sch.get_display_fields(columns)
    for f in addnl_fields:
        if f not in fields:
            fields.append(f)

    # Create the filter to select only specified columns
    query_str = ""
    prefix = ""
    addnl_filter = kwargs.pop('add_filter', None)
    for f, v in kwargs.items():
        if not v or f in key_fields or f in ["groupby"]:
            continue
        if isinstance(v, str):
            if v.startswith('!'):
                v = v[1:]
                op = '!='
            else:
                op = '=='
            query_str += "{} {}{}'{}' ".format(prefix, f, op, v)
            prefix = "and"
        else:
            query_str += "{} {}=={} ".format(prefix, f, v)
            prefix = "and"

    # Add the ignored fields back to key fields to ensure we
    # do the drop_duplicates correctly below incl reading reqd cols
    key_fields.extend(ign_key_fields)

    # Handle the case where key fields are missing from display fields
    fldset = set(fields)
    kfldset = set(key_fields)
    add_flds = kfldset.difference(fldset)
    if add_flds:
        fields.extend(list(add_flds))

    if addnl_filter:
        # This is for special cases that are specific to an object
        if not query_str:
            query_str = addnl_filter
        else:
            query_str += ' and {}'.format(addnl_filter)

    # Restore the folder to what it needs to be
    folder = self._get_table_directory(phy_table)
    if use_get_files:
        if not query_str:
            query_str = "active == True"
        pdf_list = []
        with Executor(max_workers=8) as exe:
            jobs = [exe.submit(self.read_pq_file, f, fields, query_str)
                    for f in files]
            pdf_list = [job.result() for job in jobs]

        if pdf_list:
            final_df = pd.concat(pdf_list)
        else:
            final_df = pd.DataFrame(columns=fields)

    elif view == "latest":
        if not query_str:
            # Make up a dummy query string to avoid if/then/else
            query_str = "timestamp != 0"

        try:
            final_df = (
                pa.ParquetDataset(
                    folder, filters=filters or None, validate_schema=False)
                .read(columns=fields)
                .to_pandas(split_blocks=True, self_destruct=True)
                .query(query_str)
                .drop_duplicates(subset=key_fields, keep="last")
                .query("active == True")
            )
        except pa.lib.ArrowInvalid:
            return pd.DataFrame(columns=fields)
    else:
        if not query_str:
            # Make up a dummy query string to avoid if/then/else
            query_str = 'timestamp != "0"'

        try:
            final_df = (
                pa.ParquetDataset(
                    folder, filters=filters or None, validate_schema=False)
                .read(columns=fields)
                .to_pandas()
                .query(query_str)
            )
        except pa.lib.ArrowInvalid:
            return pd.DataFrame(columns=fields)

    if 'active' not in columns:
        final_df.drop(columns=['active'], inplace=True)
        fields.remove('active')

    final_df = df_timestamp_to_datetime(final_df)

    if sort_fields and all(x in sort_fields for x in fields):
        return final_df[fields].sort_values(by=sort_fields)
    else:
        return final_df[fields]
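# A standalone sketch of the "latest view" technique above: rows are
# time-ordered per key, so drop_duplicates(keep='last') on the key
# fields leaves only each key's most recent record, and the final query
# drops records marked deleted (active == False). Data is fabricated.
import pandas as pd

hist = pd.DataFrame({
    'hostname': ['r1', 'r1', 'r2', 'r2'],
    'ifname': ['eth0', 'eth0', 'eth0', 'eth0'],
    'state': ['up', 'down', 'up', 'up'],
    'active': [True, True, True, False],
    'timestamp': [1, 2, 1, 2],
})
latest = (hist.sort_values('timestamp')
              .drop_duplicates(subset=['hostname', 'ifname'], keep='last')
              .query('active == True'))
print(latest)   # r1/eth0 is 'down'; r2/eth0 disappears (marked inactive)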