Example #1
    def get_table_info(self, table, **kwargs):
        sch = SchemaForTable(table, schema=self.schemas)
        key_fields = sch.key_fields()
        # We can't use the user-supplied view because we need all the data
        # to compute the required values.
        kwargs.pop('view', None)
        all_time_df = self._get_table_info(table, view='all', **kwargs)
        times = all_time_df['timestamp'].unique()
        ret = {'first_time': all_time_df.timestamp.min(),
               'latest_time': all_time_df.timestamp.max(),
               'intervals': len(times),
               'all rows': len(all_time_df),
               'namespaces': self._unique_or_zero(all_time_df, 'namespace'),
               'devices': self._unique_or_zero(all_time_df, 'hostname')}

        return ret
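For reference, here is a minimal, self-contained sketch of the summarization pattern get_table_info relies on. The toy frame and dict keys mirror the snippet; _unique_or_zero is assumed to be nunique() with a zero fallback for a missing column.

import pandas as pd

# Toy stand-in for the frame returned by _get_table_info(view='all')
all_time_df = pd.DataFrame({
    'timestamp': [100, 100, 200, 300],
    'namespace': ['dc1', 'dc1', 'dc2', 'dc2'],
    'hostname': ['leaf01', 'leaf02', 'leaf01', 'leaf01'],
})

info = {
    'first_time': all_time_df.timestamp.min(),      # 100
    'latest_time': all_time_df.timestamp.max(),     # 300
    'intervals': all_time_df.timestamp.nunique(),   # 3 distinct snapshots
    'all rows': len(all_time_df),                   # 4
    'namespaces': all_time_df.namespace.nunique(),  # assumed _unique_or_zero
    'devices': all_time_df.hostname.nunique(),      # behavior: nunique or 0
}
print(info)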
Example #2
class SqObject(object):
    def __init__(self,
                 engine_name: str = 'pandas',
                 hostname: typing.List[str] = [],
                 start_time: str = '',
                 end_time: str = '',
                 view: str = 'latest',
                 namespace: typing.List[str] = [],
                 columns: typing.List[str] = ['default'],
                 context=None,
                 table: str = '',
                 config_file=None) -> None:

        if context is None:
            self.ctxt = SqContext(engine_name, config_file)
        else:
            self.ctxt = context
            if not self.ctxt:
                self.ctxt = SqContext(engine_name)

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()

        if not namespace and self.ctxt.namespace:
            self.namespace = self.ctxt.namespace
        else:
            self.namespace = namespace
        if not hostname and self.ctxt.hostname:
            self.hostname = self.ctxt.hostname
        else:
            self.hostname = hostname

        if not start_time and self.ctxt.start_time:
            self.start_time = self.ctxt.start_time
        else:
            self.start_time = start_time

        if not end_time and self.ctxt.end_time:
            self.end_time = self.ctxt.end_time
        else:
            self.end_time = end_time

        if not view and self.ctxt.view:
            self.view = self.ctxt.view
        else:
            self.view = view
        self.columns = columns

        if engine_name:
            self.engine = get_sqengine(engine_name, self._table)(self._table,
                                                                 self)
        elif self.ctxt.engine:
            self.engine = get_sqengine(self.ctxt.engine,
                                       self._table)(self._table, self)
        else:
            self.engine = None

        if not self.engine:
            raise ValueError('Unknown analysis engine')

        self._addnl_filter = None
        self._addnl_fields = []
        self._valid_get_args = None
        self._valid_assert_args = None
        self._valid_arg_vals = None

    @property
    def all_schemas(self):
        return self.ctxt.schemas

    @property
    def schema(self):
        return self._schema

    @property
    def cfg(self):
        return self._cfg

    @property
    def table(self):
        return self._table

    def _check_input_for_valid_args(
        self,
        good_arg_list,
        **kwargs,
    ):
        if not good_arg_list:
            return

        # add standard args that are always valid
        good_arg_list = good_arg_list + ['namespace', 'addnl_fields']

        for arg in kwargs.keys():
            if arg not in good_arg_list:
                raise AttributeError(
                    f"argument {arg} not supported for this command")

    def _check_input_for_valid_vals(self, good_arg_val_list, **kwargs):
        '''Check if the input is valid for the arg, if possible'''

        if not good_arg_val_list:
            return

        for arg in kwargs.keys():
            if arg in good_arg_val_list:
                if kwargs[arg] not in good_arg_val_list[arg]:
                    raise AttributeError(
                        f"invalid value {kwargs[arg]} for argument {arg}")

    def validate_get_input(self, **kwargs):
        # _valid_get_args may be None; guard before concatenating
        if self._valid_get_args:
            self._check_input_for_valid_args(
                self._valid_get_args + ['columns'], **kwargs)
        self._check_input_for_valid_vals(self._valid_arg_vals, **kwargs)

    def validate_assert_input(self, **kwargs):
        self._check_input_for_valid_args(self._valid_assert_args, **kwargs)

    def get(self, **kwargs) -> pd.DataFrame:

        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        if self._addnl_filter:
            kwargs['add_filter'] = self._addnl_filter

        # Validation raises on failure; return the error as a DataFrame
        try:
            self.validate_get_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        return self.engine.get(**kwargs)

    def summarize(self,
                  namespace=[],
                  hostname=[],
                  query_str='') -> pd.DataFrame:
        if self.columns != ["default"]:
            self.summarize_df = pd.DataFrame({
                'error': ['ERROR: You cannot specify columns with summarize']
            })
            return self.summarize_df
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine.summarize(namespace=namespace,
                                     hostname=hostname,
                                     query_str=query_str)

    def unique(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        columns = kwargs.pop('columns', self.columns)
        return self.engine.unique(**kwargs, columns=columns)

    def analyze(self, **kwargs):
        raise NotImplementedError

    def aver(self, **kwargs):
        raise NotImplementedError

    def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        # if self._valid_get_args:
        #     self._valid_get_args += ['what', 'n', 'reverse']
        # Validation raises on failure; return the error as a DataFrame
        try:
            self.validate_get_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        table_schema = SchemaForTable(self._table, self.all_schemas)
        columns = table_schema.get_display_fields(columns)

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine.top(what=what, n=n, reverse=reverse, **kwargs)

    def humanize_fields(self, df: pd.DataFrame, subset=None) -> pd.DataFrame:
        '''Humanize the fields for human consumption.

        Individual classes will implement the right transformations. This
        routine is a no-op placeholder for those with nothing to modify.
        '''
        return df
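The validation helpers above boil down to two checks: every keyword must come from an allow-list, and constrained keywords must carry an allowed value. A minimal, self-contained sketch of that pattern (function names here are illustrative, not suzieq's API):

def check_valid_args(good_args, **kwargs):
    # standard args that every command accepts
    good_args = good_args + ['namespace', 'addnl_fields']
    for arg in kwargs:
        if arg not in good_args:
            raise AttributeError(f'argument {arg} not supported')


def check_valid_vals(good_vals, **kwargs):
    # good_vals maps an argument name to its permitted values
    for arg, val in kwargs.items():
        if arg in good_vals and val not in good_vals[arg]:
            raise AttributeError(f'invalid value {val} for argument {arg}')


check_valid_args(['hostname', 'state'], hostname=['leaf01'], state='full')
check_valid_vals({'state': ['full', 'other', 'passive']}, state='full')
try:
    check_valid_vals({'state': ['full', 'other', 'passive']}, state='up')
except AttributeError as err:
    print(err)  # invalid value up for argument state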
Example #3
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if columns == ['*']:
            drop_cols.append('sqvers')

        aug_fields = sch.get_augmented_fields()

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields + addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        # Order matters: resolve augmented-field dependencies before the
        # missing-key-fields insert that follows
        for f in aug_fields:
            dep_fields = sch.get_parent_fields(f)
            addnl_fields += dep_fields

        for fld in key_fields:
            if fld not in fields + addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        if self.iobj.start_time:
            try:
                start_time = dateparser.parse(
                    self.iobj.start_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
                return pd.DataFrame()
        else:
            start_time = ''

        if self.iobj.start_time and not start_time:
            # Something went wrong with our parsing
            print(f"ERROR: unable to parse {self.iobj.start_time}")
            return pd.DataFrame()

        if self.iobj.end_time:
            try:
                end_time = dateparser.parse(
                    self.iobj.end_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
                return pd.DataFrame()
        else:
            end_time = ''

        if self.iobj.end_time and not end_time:
            # Something went wrong with our parsing
            print(f"ERROR: Unable to parse {self.iobj.end_time}")
            return pd.DataFrame()

        table_df = self._dbeng.read(phy_table,
                                    'pandas',
                                    start_time=start_time,
                                    end_time=end_time,
                                    columns=fields,
                                    view=view,
                                    key_fields=key_fields,
                                    **kwargs)

        if not table_df.empty:
            if view == "all" or not active_only:
                table_df.drop(columns=drop_cols, inplace=True)
            else:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            if 'timestamp' in table_df.columns and not table_df.empty:
                table_df['timestamp'] = humanize_timestamp(
                    table_df.timestamp,
                    self.cfg.get('analyzer', {}).get('timezone', None))

        return table_df
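The time handling above leans on the dateparser library. A small sketch of the natural-language-to-epoch-milliseconds conversion, assuming dateparser is installed; the helper name is illustrative:

import dateparser

def to_epoch_ms(timestr: str) -> float:
    # 'last night' is rewritten because the dateparser versions this code
    # targeted don't understand it
    parsed = dateparser.parse(timestr.replace('last night', 'yesterday'))
    if parsed is None:  # dateparser returns None rather than raising
        raise ValueError(f'unable to parse {timestr}')
    return parsed.timestamp() * 1000

print(to_epoch_ms('2 hours ago'))
print(to_epoch_ms('last night'))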
Example #4
    def _get_combined_df(self, **kwargs):
        """OSPF has info divided across multiple tables. Get a single one"""

        columns = kwargs.pop('columns', ['default'])
        state = kwargs.pop('state', '')
        addnl_fields = kwargs.pop('addnl_fields', self.iobj._addnl_fields)
        addnl_nbr_fields = self.iobj._addnl_nbr_fields

        cols = SchemaForTable('ospf', schema=self.schemas) \
            .get_display_fields(columns)
        if columns == ['default']:
            cols.append('timestamp')

        ifschema = SchemaForTable('ospfIf', schema=self.schemas)
        nbrschema = SchemaForTable('ospfNbr', schema=self.schemas)

        if (columns != ['default']) and (columns != ['*']):
            ifkeys = ifschema.key_fields()
            nbrkeys = nbrschema.key_fields()
            if_flds = ifschema.fields
            nbr_flds = nbrschema.fields

            ifcols = ifkeys
            nbrcols = nbrkeys
            for fld in columns:
                if fld in if_flds and fld not in ifcols:
                    ifcols.append(fld)
                elif fld in nbr_flds and fld not in nbrcols:
                    nbrcols.append(fld)
        else:
            ifcols = ifschema.get_display_fields(columns)
            nbrcols = nbrschema.get_display_fields(columns)

        if state == "full":
            query_str = 'adjState == "full" or adjState == "passive"'
        elif state == "other":
            query_str = 'adjState != "full" and adjState != "passive"'
        elif state == "passive":
            query_str = 'adjState == "passive"'
        else:
            query_str = ''

        df = self.get_valid_df('ospfIf',
                               addnl_fields=addnl_fields,
                               columns=ifcols,
                               **kwargs)
        nbr_df = self.get_valid_df('ospfNbr',
                                   addnl_fields=addnl_nbr_fields,
                                   columns=nbrcols,
                                   **kwargs)
        if nbr_df.empty:
            return nbr_df

        merge_cols = [
            x for x in ['namespace', 'hostname', 'ifname']
            if x in nbr_df.columns
        ]
        # Merge the two tables
        df = df.merge(nbr_df, on=merge_cols, how='left')

        if columns == ['*']:
            df = df.drop(columns=['area_y', 'instance_y', 'vrf_y',
                                  'areaStub_y', 'timestamp_y']) \
                .rename(columns={
                    'instance_x': 'instance', 'areaStub_x': 'areaStub',
                    'area_x': 'area', 'vrf_x': 'vrf',
                    'state_x': 'ifState', 'state_y': 'adjState',
                    'sqvers_x': 'sqvers', 'active_x': 'active',
                    'timestamp_x': 'timestamp'})
        else:
            df = df.rename(
                columns={
                    'vrf_x': 'vrf',
                    'area_x': 'area',
                    'state_x': 'ifState',
                    'state_y': 'adjState',
                    'timestamp_x': 'timestamp'
                })
            df = df.drop(list(df.filter(regex='_y$')), axis=1) \
                .fillna({'peerIP': '-', 'numChanges': 0,
                         'lastChangeTime': 0})

        # Rows without an adjacency derive adjState from the passive flag:
        # passive interfaces become 'passive', the rest 'fail'
        if 'passive' in df.columns:
            df.loc[df['adjState'].isnull(), 'adjState'] = df['passive']
            df.loc[df['adjState'].eq(True), 'adjState'] = 'passive'
            df.loc[df['adjState'].eq(False), 'adjState'] = 'fail'
            df.drop(columns=['passive'], inplace=True)

        df.bfill(axis=0, inplace=True)

        # Reindex to the display-column list (timestamp lands last)
        if query_str:
            return df.query(query_str)[cols]
        return df[cols]
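The merge-and-rename dance above is standard pandas suffix cleanup: keep the interface-side (_x) columns under their original names, then drop the leftover neighbor-side (_y) duplicates. A toy version:

import pandas as pd

if_df = pd.DataFrame({'hostname': ['leaf01'], 'ifname': ['swp1'],
                      'vrf': ['default'], 'state': ['up']})
nbr_df = pd.DataFrame({'hostname': ['leaf01'], 'ifname': ['swp1'],
                       'vrf': ['default'], 'state': ['full']})

# Columns present in both frames pick up _x/_y suffixes on merge
df = if_df.merge(nbr_df, on=['hostname', 'ifname'], how='left')
# Rename first so state_y survives as adjState, then drop remaining _y cols
df = df.rename(columns={'vrf_x': 'vrf', 'state_x': 'ifState',
                        'state_y': 'adjState'})
df = df.drop(list(df.filter(regex='_y$')), axis=1)
print(df)  # hostname, ifname, vrf, ifState, adjState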
Example #5
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields + addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        for fld in key_fields:
            if fld not in fields + addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        for dt in [self.iobj.start_time, self.iobj.end_time]:
            if dt:
                try:
                    parse(dt)
                except (ValueError, ParserError) as e:
                    print(f"invalid time {dt}: {e}")
                    return pd.DataFrame()

        table_df = self.ctxt.engine.get_table_df(
            self.cfg,
            table=phy_table,
            start_time=self.iobj.start_time,
            end_time=self.iobj.end_time,
            columns=fields,
            view=view,
            key_fields=key_fields,
            **kwargs)

        if not table_df.empty:
            if view == 'latest' and active_only:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            else:
                table_df.drop(columns=drop_cols, inplace=True)
            if 'timestamp' in table_df.columns:
                table_df['timestamp'] = pd.to_datetime(
                    table_df.timestamp.astype(str), unit="ms")

        return table_df
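The closing timestamp conversion is plain pandas. A minimal demonstration of epoch milliseconds to datetime (integer input used here for simplicity):

import pandas as pd

table_df = pd.DataFrame({'timestamp': [1609459200000, 1609545600000]})
table_df['timestamp'] = pd.to_datetime(table_df.timestamp, unit='ms')
print(table_df)
# 0 2021-01-01
# 1 2021-01-02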
Example #6
    def _get_combined_df(self, **kwargs):
        """OSPF has info divided across multiple tables. Get a single one"""

        columns = kwargs.pop('columns', ['default'])
        state = kwargs.pop('state', '')
        addnl_fields = kwargs.pop('addnl_fields', self.iobj._addnl_fields)
        addnl_nbr_fields = self.iobj._addnl_nbr_fields
        user_query = kwargs.pop('query_str', '')

        cols = SchemaForTable('ospf', schema=self.schemas) \
            .get_display_fields(columns)
        if columns == ['default']:
            cols.append('timestamp')

        ifschema = SchemaForTable('ospfIf', schema=self.schemas)
        nbrschema = SchemaForTable('ospfNbr', schema=self.schemas)

        if (columns != ['default']) and (columns != ['*']):
            ifkeys = ifschema.key_fields()
            nbrkeys = nbrschema.key_fields()
            if_flds = ifschema.fields
            nbr_flds = nbrschema.fields

            ifcols = ifkeys
            nbrcols = nbrkeys
            for fld in columns:
                if fld in if_flds and fld not in ifcols:
                    ifcols.append(fld)
                elif fld in nbr_flds and fld not in nbrcols:
                    nbrcols.append(fld)
        else:
            ifcols = ifschema.get_display_fields(columns)
            nbrcols = nbrschema.get_display_fields(columns)

        if state == "full":
            query_str = 'adjState == "full" or adjState == "passive"'
        elif state == "other":
            query_str = 'adjState != "full" and adjState != "passive"'
        elif state == "passive":
            query_str = 'adjState == "passive"'
        else:
            query_str = ''

        df = self.get_valid_df('ospfIf',
                               addnl_fields=addnl_fields,
                               columns=ifcols,
                               **kwargs)
        nbr_df = self.get_valid_df('ospfNbr',
                                   addnl_fields=addnl_nbr_fields,
                                   columns=nbrcols,
                                   **kwargs)
        if nbr_df.empty:
            return df

        merge_cols = [
            x for x in ['namespace', 'hostname', 'ifname']
            if x in nbr_df.columns
        ]
        # Merge the two tables
        df = df.merge(nbr_df, on=merge_cols, how='left')

        # Some NOS keep ipAddress in the neighbor table, others in the
        # interface table. The neighbor table wins when both are present
        if 'ipAddress_y' in df:
            df['ipAddress'] = np.where(df['ipAddress_y'] == "",
                                       df['ipAddress_x'], df['ipAddress_y'])
            df['ipAddress'] = np.where(df['ipAddress'], df['ipAddress'],
                                       df['ipAddress_x'])

        if columns == ['*']:
            df = df.drop(columns=['area_y', 'instance_y', 'vrf_y',
                                  'ipAddress_x', 'ipAddress_y', 'areaStub_y',
                                  'timestamp_y'], errors='ignore') \
                .rename(columns={
                    'instance_x': 'instance', 'areaStub_x': 'areaStub',
                    'area_x': 'area', 'vrf_x': 'vrf',
                    'state_x': 'ifState', 'state_y': 'adjState',
                    'sqvers_x': 'sqvers', 'active_x': 'active',
                    'timestamp_x': 'timestamp'})
        else:
            df = df.rename(
                columns={
                    'vrf_x': 'vrf',
                    'area_x': 'area',
                    'state_x': 'ifState',
                    'state_y': 'adjState',
                    'timestamp_x': 'timestamp'
                })
            df = df.drop(list(df.filter(regex='_y$')), axis=1) \
                   .drop('ipAddress_x', axis=1, errors='ignore') \
                   .fillna({'peerIP': '-', 'numChanges': 0,
                            'lastChangeTime': 0})

        # Rows without an adjacency derive adjState from the passive flag:
        # passive interfaces become 'passive', the rest 'fail'
        if 'passive' in df.columns:
            df.loc[df['adjState'].isnull(), 'adjState'] = df['passive']
            df.loc[df['adjState'].eq(True), 'adjState'] = 'passive'
            df.loc[df['adjState'].eq(False), 'adjState'] = 'fail'
            df.drop(columns=['passive'], inplace=True)

        df.bfill(axis=0, inplace=True)

        if 'peerHostname' in columns or (columns in [['*'], ['default']]):
            nfdf = df.query('adjState != "full"').reset_index()
            nfdf['peerHostname'] = ''
            newdf = df.query('adjState == "full"').reset_index() \
                .drop('peerHostname', axis=1, errors='ignore')
            if not newdf.empty:
                newdf['matchIP'] = newdf.ipAddress.str.split('/').str[0]
                newdf = newdf.merge(newdf[['namespace', 'hostname', 'vrf',
                                           'matchIP']],
                                    left_on=['namespace', 'vrf', 'peerIP'],
                                    right_on=['namespace', 'vrf', 'matchIP'],
                                    suffixes=["", "_y"]) \
                    .rename(columns={'hostname_y': 'peerHostname'}) \
                    .drop_duplicates(subset=['namespace', 'hostname',
                                             'vrf', 'ifname']) \
                    .drop(columns=['matchIP', 'matchIP_y'], errors='ignore')

                if newdf.empty:
                    newdf = df.query('adjState == "full"').reset_index()
                    newdf['peerHostname'] = ''
                final_df = pd.concat([nfdf, newdf])
            else:
                final_df = df
        else:
            final_df = df

        if query_str:
            final_df = final_df.query(query_str).reset_index(drop=True)

        if user_query and not final_df.empty:
            final_df = self._handle_user_query_str(final_df, user_query)
        # Reindex to the display-column list (timestamp lands last)
        return final_df[cols]
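The ipAddress reconciliation uses np.where to prefer the neighbor table's value and fall back to the interface table's when it is empty. A toy version of that cascade:

import numpy as np
import pandas as pd

df = pd.DataFrame({'ipAddress_x': ['10.0.0.1/30', '10.0.0.5/30'],
                   'ipAddress_y': ['10.0.0.2/30', '']})
# Neighbor-table value (_y) wins unless it is the empty string
df['ipAddress'] = np.where(df['ipAddress_y'] == '',
                           df['ipAddress_x'], df['ipAddress_y'])
print(df['ipAddress'].tolist())  # ['10.0.0.2/30', '10.0.0.5/30']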
Example #7
class SqObject(object):
    def __init__(self,
                 engine_name: str = '',
                 hostname: typing.List[str] = [],
                 start_time: str = '',
                 end_time: str = '',
                 view: str = 'latest',
                 namespace: typing.List[str] = [],
                 columns: typing.List[str] = ['default'],
                 context=None,
                 table: str = '',
                 config_file=None) -> None:

        if context is None:
            self.ctxt = SqContext(engine_name, config_file)
        else:
            self.ctxt = context
            if not self.ctxt:
                self.ctxt = SqContext(engine_name)

        self._cfg = self.ctxt.cfg
        self._schema = SchemaForTable(table, self.ctxt.schemas)
        self._table = table
        self._sort_fields = self._schema.key_fields()

        if not namespace and self.ctxt.namespace:
            self.namespace = self.ctxt.namespace
        else:
            self.namespace = namespace
        if not hostname and self.ctxt.hostname:
            self.hostname = self.ctxt.hostname
        else:
            self.hostname = hostname

        if not start_time and self.ctxt.start_time:
            self.start_time = self.ctxt.start_time
        else:
            self.start_time = start_time

        if not end_time and self.ctxt.end_time:
            self.end_time = self.ctxt.end_time
        else:
            self.end_time = end_time

        if not view and self.ctxt.view:
            self.view = self.ctxt.view
        else:
            self.view = view
        self.columns = columns

        if engine_name:
            self.engine = get_sqengine(engine_name)
        else:
            self.engine = self.ctxt.engine

        if self._table:
            self.engine_obj = self.engine.get_object(self._table, self)
        else:
            self.engine_obj = None

        self._addnl_filter = None
        self._addnl_fields = []

    @property
    def all_schemas(self):
        return self.ctxt.schemas

    @property
    def schema(self):
        return self._schema

    @property
    def cfg(self):
        return self._cfg

    @property
    def table(self):
        return self._table

    def validate_input(self, **kwargs):
        """Dummy validate input"""
        return

    def get(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        if self._addnl_filter:
            kwargs['add_filter'] = self._addnl_filter

        # Validation raises on failure; return the error as a DataFrame
        try:
            self.validate_input(**kwargs)
        except Exception as error:
            df = pd.DataFrame({'error': [f'{error}']})
            return df

        return self.engine_obj.get(**kwargs)

    def summarize(self, namespace='') -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine_obj.summarize(namespace=namespace)

    def unique(self, **kwargs) -> pd.DataFrame:
        if not self._table:
            raise NotImplementedError

        if not self.ctxt.engine:
            raise AttributeError('No analysis engine specified')

        return self.engine_obj.unique(**kwargs)

    def analyze(self, **kwargs):
        raise NotImplementedError

    def aver(self, **kwargs):
        raise NotImplementedError

    def top(self, what='', n=5, reverse=False, **kwargs) -> pd.DataFrame:
        """Get the list of top/bottom entries of "what" field"""

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        table_schema = SchemaForTable(self._table, self.all_schemas)
        columns = table_schema.get_display_fields(columns)

        if what not in columns:
            self._addnl_fields.append(what)

        return self.engine_obj.top(what=what, n=n, reverse=reverse, **kwargs)
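The context-inheritance pattern in __init__ is the same for namespace, hostname, the time bounds, and view: an explicit argument wins, otherwise the shared context's value applies. A standalone sketch of that precedence rule, with SqContext stubbed by a hypothetical Ctx class:

class Ctx:
    namespace = ['dc1']
    hostname = []


def resolve(arg, ctxt_val):
    # explicit argument if given, else the context default
    return arg if arg else ctxt_val


ctxt = Ctx()
print(resolve([], ctxt.namespace))       # ['dc1'], inherited from context
print(resolve(['dc2'], ctxt.namespace))  # ['dc2'], explicit wins
print(resolve([], ctxt.hostname))        # [], nothing set anywhere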
Example #8
    def get_valid_df(self, table, **kwargs) -> pd.DataFrame:
        if not self.ctxt.engine:
            print("Specify an analysis engine using set engine command")
            return pd.DataFrame(columns=["namespace", "hostname"])

        sch = SchemaForTable(table, schema=self.schemas)
        phy_table = sch.get_phy_table_for_table()

        columns = kwargs.pop('columns', ['default'])
        addnl_fields = kwargs.pop('addnl_fields', [])
        view = kwargs.pop('view', self.iobj.view)
        active_only = kwargs.pop('active_only', True)
        query_str = kwargs.pop('query_str', '')

        # The REST API provides the query_str enclosed in quotes ("). Strip them
        if query_str:
            if query_str.startswith('"') and query_str.endswith('"'):
                query_str = query_str[1:-1]

        fields = sch.get_display_fields(columns)
        key_fields = sch.key_fields()
        drop_cols = []

        if columns == ['*']:
            drop_cols.append('sqvers')

        if 'timestamp' not in fields:
            fields.append('timestamp')

        if 'active' not in fields+addnl_fields:
            addnl_fields.append('active')
            drop_cols.append('active')

        for fld in key_fields:
            if fld not in fields+addnl_fields:
                addnl_fields.insert(0, fld)
                drop_cols.append(fld)

        for f in addnl_fields:
            if f not in fields:
                # timestamp is always the last field
                fields.insert(-1, f)

        if self.iobj.start_time:
            try:
                start_time = dateparser.parse(
                    self.iobj.start_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.start_time}: {e}")
                return pd.DataFrame()
        else:
            start_time = ''

        if self.iobj.start_time and not start_time:
            # Something went wrong with our parsing
            print(f"ERROR: unable to parse {self.iobj.start_time}")
            return pd.DataFrame()

        if self.iobj.end_time:
            try:
                end_time = dateparser.parse(
                    self.iobj.end_time.replace('last night', 'yesterday')) \
                    .timestamp()*1000
            except Exception as e:
                print(f"ERROR: invalid time {self.iobj.end_time}: {e}")
                return pd.DataFrame()
        else:
            end_time = ''

        if self.iobj.end_time and not end_time:
            # Something went wrong with our parsing
            print(f"ERROR: Unable to parse {self.iobj.end_time}")
            return pd.DataFrame()

        table_df = self._dbeng.read(
            phy_table,
            'pandas',
            start_time=start_time,
            end_time=end_time,
            columns=fields,
            view=view,
            key_fields=key_fields,
            **kwargs
        )

        if not table_df.empty:
            if view == 'latest' and active_only:
                table_df = table_df.query('active') \
                                   .drop(columns=drop_cols)
            else:
                table_df.drop(columns=drop_cols, inplace=True)
            if 'timestamp' in table_df.columns:
                table_df['timestamp'] = humanize_timestamp(
                    table_df.timestamp, self.cfg.get('analyzer', {})
                    .get('timezone', None))

        if query_str:
            return table_df.query(query_str)
        else:
            return table_df
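The quote stripping plus DataFrame.query combination is easy to isolate. A minimal sketch (the helper name is illustrative):

import pandas as pd

def apply_user_query(df: pd.DataFrame, query_str: str) -> pd.DataFrame:
    # The REST layer wraps the expression in double quotes; strip one pair
    if query_str.startswith('"') and query_str.endswith('"'):
        query_str = query_str[1:-1]
    return df.query(query_str) if query_str else df

df = pd.DataFrame({'hostname': ['leaf01', 'spine01'], 'mtu': [9216, 1500]})
print(apply_user_query(df, '"mtu > 1500"'))  # only leaf01's row survives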
Example #9
    def get_table_df(self, cfg, schemas, **kwargs) -> pd.DataFrame:
        """Use Pandas instead of Spark to retrieve the data"""

        MAX_FILECNT_TO_READ_FOLDER = 10000

        self.cfg = cfg

        table = kwargs.pop("table")
        start = kwargs.pop("start_time")
        end = kwargs.pop("end_time")
        view = kwargs.pop("view")
        sort_fields = kwargs.pop("sort_fields")
        ign_key_fields = kwargs.pop("ign_key", [])
        addnl_fields = kwargs.pop("addnl_fields", [])

        for f in ['active', 'timestamp']:
            if f not in addnl_fields:
                addnl_fields.append(f)

        sch = SchemaForTable(table, schema=schemas)
        phy_table = sch.get_phy_table_for_table()

        folder = self._get_table_directory(phy_table)

        # Restrict to a single namespace if that's what's asked
        if "namespace" in kwargs:
            v = kwargs["namespace"]
            if v:
                if not isinstance(v, list):
                    folder += "/namespace={}/".format(v)

        fcnt = self.get_filecnt(folder)

        if fcnt == 0:
            return pd.DataFrame()

        # We are going to hard-code use_get_files until we have automated testing
        use_get_files = False

        # use_get_files = (
        #    (fcnt > MAX_FILECNT_TO_READ_FOLDER and view == "latest") or
        #    start or end
        # )

        if use_get_files:
            # Switch to the more efficient method when there are lots of files
            # Reduce I/O since that is the worst drag
            key_fields = []
            if len(kwargs.get("namespace", [])) > 1:
                del kwargs["namespace"]
            files = get_latest_files(folder, start, end, view)
        else:
            # ign_key_fields contains key fields that are not partition cols
            key_fields = [i for i in sch.key_fields()
                          if i not in ign_key_fields]
            filters = self.build_pa_filters(start, end, key_fields, **kwargs)

        if "columns" in kwargs:
            columns = kwargs["columns"]
            del kwargs["columns"]
        else:
            columns = ["default"]

        fields = sch.get_display_fields(columns)
        for f in addnl_fields:
            if f not in fields:
                fields.append(f)

        # Build a row filter (a pandas query string) from the remaining kwargs
        query_str = ""
        prefix = ""
        addnl_filter = kwargs.pop('add_filter', None)
        for f, v in kwargs.items():
            if not v or f in key_fields or f in ["groupby"]:
                continue
            if isinstance(v, str):
                if v.startswith('!'):
                    v = v[1:]
                    op = '!='
                else:
                    op = '=='
                query_str += "{} {}{}'{}' ".format(prefix, f, op, v)
                prefix = "and"
            else:
                query_str += "{} {}=={} ".format(prefix, f, v)
                prefix = "and"

        # Add the ignored fields back to the key fields to ensure the
        # drop_duplicates below is correct and the required columns are read
        key_fields.extend(ign_key_fields)

        # Handle the case where key fields are missing from display fields
        fldset = set(fields)
        kfldset = set(key_fields)
        add_flds = kfldset.difference(fldset)
        if add_flds:
            fields.extend(list(add_flds))

        if addnl_filter:
            # This is for special cases that are specific to an object
            if not query_str:
                query_str = addnl_filter
            else:
                query_str += ' and {}'.format(addnl_filter)

        # Restore the folder to what it needs to be
        folder = self._get_table_directory(phy_table)
        if use_get_files:
            if not query_str:
                query_str = "active == True"

            pdf_list = []
            with Executor(max_workers=8) as exe:
                jobs = [
                    exe.submit(self.read_pq_file, f, fields, query_str)
                    for f in files
                ]
                pdf_list = [job.result() for job in jobs]

            if pdf_list:
                final_df = pd.concat(pdf_list)
            else:
                final_df = pd.DataFrame(columns=fields)

        elif view == "latest":
            if not query_str:
                # Make up a dummy query string to avoid if/then/else
                query_str = "timestamp != 0"

            try:
                final_df = (
                    pa.ParquetDataset(
                        folder, filters=filters or None, validate_schema=False
                    )
                    .read(columns=fields)
                    .to_pandas(split_blocks=True, self_destruct=True)
                    .query(query_str)
                    .drop_duplicates(subset=key_fields, keep="last")
                    .query("active == True")
                )
            except pa.lib.ArrowInvalid:
                return pd.DataFrame(columns=fields)
        else:
            if not query_str:
                # Make up a dummy query string to avoid if/then/else
                query_str = 'timestamp != "0"'

            try:
                final_df = (
                    pa.ParquetDataset(
                        folder, filters=filters or None, validate_schema=False
                    )
                    .read(columns=fields)
                    .to_pandas()
                    .query(query_str)
                )
            except pa.lib.ArrowInvalid:
                return pd.DataFrame(columns=fields)

        if 'active' not in columns:
            final_df.drop(columns=['active'], inplace=True)
            fields.remove('active')

        final_df = df_timestamp_to_datetime(final_df)

        if sort_fields and all(x in fields for x in sort_fields):
            return final_df[fields].sort_values(by=sort_fields)
        else:
            return final_df[fields]
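Two pandas idioms this reader leans on are worth isolating: building a query string from keyword filters (a '!' prefix negates a string match) and collapsing an all-time table into the latest view with drop_duplicates(keep='last'). A minimal sketch; the explicit sort is added here for determinism:

import pandas as pd

def build_query(**kwargs) -> str:
    terms = []
    for fld, val in kwargs.items():
        if not val:
            continue
        if isinstance(val, str):
            op, val = ('!=', val[1:]) if val.startswith('!') else ('==', val)
            terms.append(f"{fld} {op} '{val}'")
        else:
            terms.append(f'{fld} == {val}')
    return ' and '.join(terms)

print(build_query(hostname='!spine01', timestamp=200))
# hostname != 'spine01' and timestamp == 200

df = pd.DataFrame({'hostname': ['leaf01', 'leaf01', 'spine01'],
                   'timestamp': [100, 200, 200],
                   'state': ['down', 'up', 'up']})
latest = (df.sort_values('timestamp')
            .drop_duplicates(subset=['hostname'], keep='last'))
print(latest)  # one row per hostname, newest timestamp wins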