Exemple #1
0
def extend_table(table_name, df):
    """
    Add df as a table, or append its rows to a table that orca already knows about.

    When the table is already registered, the existing rows are placed first and the
    existing columns keep their order, followed by any columns found only in df.
    The resulting frame is handed to replace_table.

    Parameters
    ----------
    table_name : str
        orca/inject table name
    df : pandas DataFrame
        rows to add; its index is expected to share no labels with the existing table
    """

    if orca.is_table(table_name):

        existing_df = orca.get_table(table_name).to_frame()

        # the two frames are expected to have disjoint indexes
        shared_labels = existing_df.index.intersection(df.index)
        assert len(shared_labels) == 0

        # concat may reorder columns, so pin the order: existing columns, then new ones
        added_columns = [c for c in df.columns if c not in existing_df.columns]
        ordered_columns = list(existing_df.columns) + added_columns

        combined = pd.concat([existing_df, df])
        df = combined[ordered_columns]

    replace_table(table_name, df)
Exemple #2
0
def assert_table_not_registered(table_name):
    """
    Raise OrcaAssertionError if table_name is already registered with orca.

    Returns None when the table is not registered.
    """
    if not orca.is_table(table_name):
        return

    raise OrcaAssertionError("Table '%s' is already registered" % table_name)
Exemple #3
0
def add_table(table_name, table, cache=False):
    """
    Register a table with orca, logging a warning if the name is already registered.

    Parameters
    ----------
    table_name : str
        orca/inject table name
    table
        object passed through unchanged to orca.add_table
    cache : bool
        passed through to orca.add_table

    Returns
    -------
    the value returned by orca.add_table
    """

    if orca.is_table(table_name):
        # Logger.warn is a deprecated alias of Logger.warning (removed in Python 3.13)
        logger.warning("inject add_table replacing existing table %s", table_name)

    return orca.add_table(table_name, table, cache=cache)
Exemple #4
0
def drop_table(table_name):
    """
    Forget a table: deregister it from orca and update the pipeline bookkeeping.

    - if orca knows the table, clear its cache and remove it and its columns from
      orca's internal registries
    - if the table is listed in _PIPELINE.replaced_tables, remove that entry
    - if the table is listed in _PIPELINE.last_checkpoint, blank its checkpoint name
      (the key is kept, with an empty string as its value)

    Parameters
    ----------
    table_name : str
    """

    if orca.is_table(table_name):

        logger.debug("drop_table dropping orca table '%s'" % table_name)

        # the raw table is fetched so that a TableFuncWrapper's function is not called
        raw_table = orca.get_raw_table(table_name)
        raw_table.clear_cached()

        # deregister every column orca lists for this table
        for col in orca.list_columns_for_table(table_name):
            orca.orca._COLUMNS.pop((table_name, col), None)

        # and finally the table itself
        orca.orca._TABLES.pop(table_name, None)

    replaced = _PIPELINE.replaced_tables
    if table_name in replaced:

        logger.debug("drop_table forgetting replaced_tables '%s'" % table_name)
        del replaced[table_name]

    if table_name in _PIPELINE.last_checkpoint:

        logger.debug("drop_table removing table %s from last_checkpoint" % table_name)

        _PIPELINE.last_checkpoint[table_name] = ''
Exemple #5
0
def assert_table_is_registered(table_name):
    """
    Raise OrcaAssertionError unless table_name has been registered with orca.

    Returns None when the table is registered.
    """
    if orca.is_table(table_name):
        return

    raise OrcaAssertionError("Table '%s' is not registered" % table_name)
Exemple #6
0
    def wrapper(**kwargs):
        # Guard around func: abort with 404 unless both the table and the column
        # named in the keyword arguments are known to orca.
        requested_table, requested_col = kwargs['table_name'], kwargs['col_name']

        if not orca.is_table(requested_table):
            abort(404)

        available_columns = orca.get_table(requested_table).columns
        if requested_col not in available_columns:
            abort(404)

        return func(**kwargs)
Exemple #7
0
def get_table(table_name, checkpoint_name=None):
    """
    Return pandas dataframe corresponding to table_name

    if checkpoint_name is None, return the current (most recent) version of the table.
    The table can be a checkpointed table or any registered orca table (e.g. function table)

    if checkpoint_name is specified, return table as it was at that checkpoint
    (the most recently checkpointed version of the table at or before checkpoint_name)

    Parameters
    ----------
    table_name : str
    checkpoint_name : str or None

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    RuntimeError
        - checkpoint_name was given for a registered orca table that is not in
          _PIPELINE.last_checkpoint
        - the current version was requested but the table was never checkpointed,
          or its last_checkpoint entry is empty (dropped)
        - checkpoint_name is not found in _PIPELINE.checkpoints
        - the requested checkpoint has no entry for the table
    """

    # orca table not in checkpoints (e.g. a merged table)
    if table_name not in _PIPELINE.last_checkpoint and orca.is_table(table_name):
        if checkpoint_name is not None:
            # adjacent literals are joined verbatim, hence the trailing space after 'supported'
            raise RuntimeError("get_table: checkpoint_name ('%s') not supported "
                               "for non-checkpointed table '%s'" % (checkpoint_name, table_name))

        return orca.get_table(table_name).to_frame()

    # if they want current version of table, no need to read from pipeline store
    if checkpoint_name is None:

        if table_name not in _PIPELINE.last_checkpoint:
            raise RuntimeError("table '%s' never checkpointed." % table_name)

        # an empty checkpoint name marks a table that was dropped
        if not _PIPELINE.last_checkpoint[table_name]:
            raise RuntimeError("table '%s' was dropped." % table_name)

        return orca.get_table(table_name).to_frame()

    # find the requested checkpoint
    checkpoint = \
        next((x for x in _PIPELINE.checkpoints if x['checkpoint_name'] == checkpoint_name), None)
    if checkpoint is None:
        raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name)

    # find the checkpoint that table was written to store
    last_checkpoint_name = checkpoint.get(table_name, None)

    if not last_checkpoint_name:
        raise RuntimeError("table '%s' not in checkpoint '%s'." % (table_name, checkpoint_name))

    # if this version of table is same as current, the registered orca table can be used
    if _PIPELINE.last_checkpoint.get(table_name, None) == last_checkpoint_name:
        return orca.get_table(table_name).to_frame()

    return read_df(table_name, last_checkpoint_name)
Exemple #8
0
    def wrapper(**kwargs):
        # Guard around func: abort with 404 unless both the table and the column
        # named in the keyword arguments are known to orca.
        requested_table, requested_col = kwargs["table_name"], kwargs["col_name"]

        if not orca.is_table(requested_table):
            abort(404)

        available_columns = orca.get_table(requested_table).columns
        if requested_col not in available_columns:
            abort(404)

        return func(**kwargs)
Exemple #9
0
def add_table(table_name, table, replace=False):
    """
    Register a table with orca, refusing to overwrite an existing dataframe table.

    If replace is False and table_name is already registered with orca as a table of
    type 'dataframe', a warning is logged and AssertionError is raised.
    If replace is True the table is registered without that check.

    Parameters
    ----------
    table_name : str
        orca/inject table name
    table
        object passed through unchanged to orca.add_table
    replace : bool
        allow an existing table of the same name to be replaced

    Returns
    -------
    the value returned by orca.add_table

    Raises
    ------
    AssertionError
        if replace is False and a 'dataframe' table named table_name already exists
    """
    if not replace and orca.is_table(table_name) and orca.table_type(table_name) == 'dataframe':
        msg = "inject add_table replacing existing table %s" % table_name
        logger.warning(msg)
        # raised explicitly: a bare `assert False` is stripped when python runs with -O,
        # which would let the replacement go through silently
        raise AssertionError(msg)

    # FIXME - should add table.copy() instead, so it can't be modified behind our back?
    return orca.add_table(table_name, table, cache=False)
Exemple #10
0
def rewrap(table_name, df=None):
    """
    Add or replace an orca registered table as a unitary DataFrame-backed DataFrameWrapper table

    if df is None, then get the dataframe from orca (table_name should be registered, or
    an error will be thrown) which may involve evaluating added columns, etc.

    If the orca table already exists, deregister it along with any associated columns before
    re-registering it.

    The net result is that the dataframe is a registered orca DataFrameWrapper table with no
    computed or added columns.

    Parameters
    ----------
    table_name : str
        orca table name
    df : pandas.DataFrame or None
        frame to register; if None, the frame is taken from the registered orca table

    Returns
    -------
        the underlying df of the rewrapped table

    Raises
    ------
    AssertionError
        if df is None and table_name is not registered with orca
    """

    logger.debug("rewrap table %s inplace=%s" % (table_name, (df is None)))

    if orca.is_table(table_name):

        if df is None:
            logger.debug("rewrap - orca.get_table(%s)" % (table_name, ))
            t = orca.get_table(table_name)
            df = t.to_frame()
        else:
            logger.debug("rewrap - orca.get_raw_table(%s)" % (table_name, ))
            # don't trigger function call of TableFuncWrapper
            t = orca.get_raw_table(table_name)

        t.clear_cached()

        # deregister every column orca lists for this table
        for column_name in orca.list_columns_for_table(table_name):
            orca.orca._COLUMNS.pop((table_name, column_name), None)

        # remove from orca's table list
        orca.orca._TABLES.pop(table_name, None)

    if df is None:
        # raised explicitly (same exception type as the former assert) so the check
        # still runs when python is started with -O
        raise AssertionError(
            "rewrap: no df supplied and table '%s' is not registered" % (table_name, ))

    logger.debug("rewrap - orca.add_table(%s)" % (table_name, ))
    orca.add_table(table_name, df)

    return df
Exemple #11
0
def get_table(table_name, checkpoint_name=None):
    """
    Return pandas dataframe corresponding to table_name

    if checkpoint_name is None, return the current (most recent) version of the table.
    The table can be a checkpointed table or any registered orca table (e.g. function table)

    if checkpoint_name is specified, return table as it was at that checkpoint
    (the most recently checkpointed version of the table at or before checkpoint_name)

    Parameters
    ----------
    table_name : str
    checkpoint_name : str or None

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    RuntimeError
        - checkpoint_name was given for a registered orca table that is not in
          _PIPELINE.last_checkpoint
        - the table is not among checkpointed_tables()
        - checkpoint_name is not the name of any checkpoint in _PIPELINE.checkpoints
    """

    # orca table not in checkpoints (e.g. a merged table)
    if table_name not in _PIPELINE.last_checkpoint and orca.is_table(
            table_name):
        if checkpoint_name is not None:
            # adjacent literals are joined verbatim, hence the trailing space after 'supported'
            raise RuntimeError(
                "get_table: checkpoint_name ('%s') not supported "
                "for non-checkpointed table '%s'" %
                (checkpoint_name, table_name))

        return orca.get_table(table_name).to_frame()

    # was table ever checkpointed?
    if table_name not in checkpointed_tables():
        raise RuntimeError("table '%s' not in checkpointed tables." %
                           table_name)

    # if they want current version of table, no need to read from pipeline store
    if checkpoint_name is None or _PIPELINE.last_checkpoint[
            table_name] == checkpoint_name:
        return orca.get_table(table_name).to_frame()

    # the requested checkpoint has to be one the pipeline knows about
    if checkpoint_name not in [
            checkpoint[CHECKPOINT_NAME] for checkpoint in _PIPELINE.checkpoints
    ]:
        raise RuntimeError("checkpoint '%s' not in checkpoints." %
                           checkpoint_name)

    return read_df(table_name, checkpoint_name)
Exemple #12
0
    def load_channels(self, saved_channels):
        """
        Re-create the channels described by saved_channels.

        saved_channels is a list of channel states created by get_channels and saved by
        the pipeline manager at a checkpoint. The saved state lets the channels be
        restored to where they were when checkpointed, so that the random number streams
        can be resumed.

        Channel table names are assumed to correspond to orca table names, so the
        domain_df for a channel can be fetched from orca.

        A channel may be backed by several tables (tours are originally created in two
        tables, mandatory and non-mandatory). Each of them is tried, and tables that are
        not registered with orca are skipped, because the checkpoint may have occurred
        when only some of them had been created.

        Parameters
        ----------
        saved_channels : array of SavedChannelState
        """

        for saved_state in saved_channels:

            name = saved_state.channel_name
            assert name in self.channel_info

            # FIXME - this rigamarole is here to support the tours channel two component tables
            component_tables = self.get_channel_info(name, 'table_names')

            logger.debug("loading channel %s from %s" %
                         (saved_state.channel_name, component_tables))

            logger.debug("channel_state %s" % (saved_state, ))

            for table_name in component_tables:
                # tables that do not exist (yet) contribute nothing to the channel
                if not orca.is_table(table_name):
                    continue

                domain_df = orca.get_table(table_name).local
                self.add_channel(domain_df,
                                 channel_name=saved_state.channel_name,
                                 step_num=saved_state.step_num,
                                 step_name=saved_state.step_name)
Exemple #13
0
 def wrapper(**kwargs):
     # Guard around func: abort with 404 unless the requested table is known to orca.
     table_known = orca.is_table(kwargs['table_name'])
     if not table_known:
         abort(404)

     return func(**kwargs)
Exemple #14
0
def get_table(name, default=_NO_DEFAULT):
    """
    Return the orca table registered as name, or default if it is not registered.

    Parameters
    ----------
    name : str
        orca table name
    default
        value to return when the table is not registered; when omitted, the lookup is
        always delegated to orca.get_table, which decides how a missing table is handled

    Returns
    -------
    the value returned by orca.get_table(name), or default
    """

    # identity test for the sentinel: `==` against a DataFrame/array default is an
    # elementwise comparison, whose result cannot be used as a truth value
    if orca.is_table(name) or default is _NO_DEFAULT:
        return orca.get_table(name)
    else:
        return default
Exemple #15
0
    def validate(self):
        """
        Check some basic expectations about the table generated by the step:

        - Confirm that the table includes a unique, named index column (primary key) or
          set of columns (composite key). If not, raise a ValueError.

        - If the table contains columns whose names match the index columns of tables
          previously registered with Orca, check whether they make sense as join keys.
          Print a status message with the number of presumptive foreign-key values that
          are found in the primary key column.

        - Perform the same check for columns in previously registered tables whose names
          match the index of the table generated by this step.

        - It doesn't currently compare indexes to indexes. (Maybe it should?)

        If the table is not yet registered with Orca, ``self.run()`` is called first.

        Running this will trigger loading all registered Orca tables into memory, which
        may take a while if they have not yet been loaded. Stand-alone columns will not
        be loaded unless their names match an index column.

        Returns
        -------
        bool
            Always True; problems are reported by raising.

        Raises
        ------
        ValueError
            If the index has no name or its values are not unique.

        """
        # There are a couple of reasons we're not using the orca_test library here:
        # (a) orca_test doesn't currently support MultiIndexes, and (b) the primary-key/
        # foreign-key comparisons aren't asserting anything, just printing status
        # messages. We should update orca_test to support both, probably.

        # Register table if needed
        if not orca.is_table(self.table):
            self.run()

        idx = orca.get_table(self.table).index

        # Check index has a name
        if list(idx.names) == [None]:
            raise ValueError("Index column has no name")

        # Check index is unique
        if len(idx.unique()) < len(idx):
            raise ValueError("Index not unique")

        # Compare columns to indexes of other tables, and vice versa
        other_tables = [t for t in orca.list_tables() if self.table != t]
        combinations = [(self.table, t) for t in other_tables] \
            + [(t, self.table) for t in other_tables]

        for t1, t2 in combinations:
            source = orca.get_table(t1)
            idx = orca.get_table(t2).index

            if set(idx.names).issubset(source.columns):
                vals = source.to_frame(idx.names).drop_duplicates()

                # Easier to compare multi-column values to multi-column index if we
                # turn the values into an index as well
                vals = vals.reset_index().set_index(idx.names).index
                vals_in_idx = vals.isin(idx).sum()

                # A table without rows has no values to match; report 0% rather than
                # dividing by zero
                if len(vals) > 0:
                    pct = round(100*vals_in_idx/len(vals))
                else:
                    pct = 0

                if len(idx.names) == 1:
                    idx_str = idx.names[0]
                else:
                    idx_str = '[{}]'.format(','.join(idx.names))

                print("'{}.{}': {} of {} unique values are found in '{}.{}' ({}%)"
                      .format(t1, idx_str,
                              vals_in_idx, len(vals),
                              t2, idx_str,
                              pct))

        return True
Exemple #16
0
def validate_table(table, reciprocal=True):
    """
    Check some basic expectations about an Orca table:

    - Confirm that it includes a unique, named index column (a.k.a. primary key) or set
      of columns (multi-index, a.k.a. composite key). If not, raise a ValueError.

    - Confirm that none of the other columns in the table share names with the index(es).
      If they do, raise a ValueError.

    - If the table contains columns whose names match the index columns of other tables
      registered with Orca, check whether they make sense as join keys. This prints a
      status message with the number of presumptive foreign-key values that are found in
      the primary/composite key, for evaluation by the user.

    - Perform the same check for columns in _other_ tables whose names match the index
      column(s) of _this_ table.

    - It doesn't currently compare indexes to indexes. (Maybe it should?)

    Running this will trigger loading all registered Orca tables, which may take a while.
    Stand-alone columns will not be loaded unless their names match an index column.

    Doesn't currently incorporate ``orca_test`` validation, but it might be added.

    Parameters
    ----------
    table : str
        Name of Orca table to validate.

    reciprocal : bool, default True
        Whether to also check how columns of other tables align with this one's index.
        If False, only check this table's columns against other tables' indexes.

    Returns
    -------
    bool
        Always True; problems are reported by raising.

    Raises
    ------
    ValueError
        If the table is not registered, the index has no name, an index name is also
        a column name, or the index values are not unique.

    """
    # There are a couple of reasons we're not using the orca_test library here:
    # (a) orca_test doesn't currently support MultiIndexes, and (b) the primary-key/
    # foreign-key comparisons aren't asserting anything, just printing status
    # messages. We should update orca_test to support both, probably.

    if not orca.is_table(table):
        raise ValueError("Table not registered with Orca: '{}'".format(table))

    idx = orca.get_table(table).index

    # Check index has a name
    if list(idx.names) == [None]:
        raise ValueError("Index column has no name")

    # Check for unique column names
    for name in list(idx.names):
        if name in list(orca.get_table(table).columns):
            raise ValueError(
                "Index names and column names overlap: '{}'".format(name))

    # Check for unique index values
    if len(idx.unique()) < len(idx):
        raise ValueError("Index not unique")

    # Compare columns to indexes of other tables, and vice versa
    other_tables = [t for t in orca.list_tables() if table != t]
    combinations = [(table, t) for t in other_tables]

    if reciprocal:
        combinations += [(t, table) for t in other_tables]

    for t1, t2 in combinations:
        source = orca.get_table(t1)
        idx = orca.get_table(t2).index

        if set(idx.names).issubset(source.columns):
            vals = source.to_frame(idx.names).drop_duplicates()

            # Easier to compare multi-column values to multi-column index if we
            # turn the values into an index as well
            vals = vals.reset_index().set_index(idx.names).index
            vals_in_idx = vals.isin(idx).sum()

            # A table without rows has no values to match; report 0% rather than
            # dividing by zero
            if len(vals) > 0:
                pct = round(100*vals_in_idx/len(vals))
            else:
                pct = 0

            if len(idx.names) == 1:
                idx_str = idx.names[0]
            else:
                idx_str = '[{}]'.format(','.join(idx.names))

            print("'{}.{}': {} of {} unique values are found in '{}.{}' ({}%)"
                  .format(t1, idx_str,
                          vals_in_idx, len(vals),
                          t2, idx_str,
                          pct))

    return True
Exemple #17
0
 def wrapper(**kwargs):
     # Guard around func: abort with 404 unless the requested table is known to orca.
     table_known = orca.is_table(kwargs["table_name"])
     if not table_known:
         abort(404)

     return func(**kwargs)