Example #1
0
def add_checkpoint(checkpoint_name):
    """
    Record a checkpoint called checkpoint_name and write out the tables that belong to it.

    Every registered table is considered in turn:

    - a table with orca columns attached is rewrapped and written
    - a table that was never checkpointed, or that is listed in the pipeline's
      replaced tables, is written as it currently stands
    - any other table is skipped

    Afterwards the replaced-table set is cleared, the checkpoint is appended to the
    checkpoint history, and the complete history is rewritten to the store.

    Parameters
    ----------
    checkpoint_name : str
    """
    now = dt.datetime.now()

    logger.debug("add_checkpoint %s timestamp %s" % (checkpoint_name, now))

    for name in registered_tables():

        # FIXME - this won't detect if the orca table was modified
        if len(orca.list_columns_for_table(name)) > 0:
            # columns are attached: collapse the table into a plain DataFrame-backed table
            frame = rewrap(name)
        else:
            never_written = name not in _PIPELINE.last_checkpoint
            if not (never_written or name in _PIPELINE.replaced_tables):
                # already checkpointed and not replaced since - nothing new to write
                continue
            frame = orca.get_table(name).to_frame()

        logger.debug("add_checkpoint '%s' table '%s' %s" %
                     (checkpoint_name, name, util.df_size(frame)))
        write_df(frame, name, checkpoint_name)

        # note the checkpoint in which this table was most recently written
        _PIPELINE.last_checkpoint[name] = checkpoint_name

    _PIPELINE.replaced_tables.clear()

    _PIPELINE.last_checkpoint[CHECKPOINT_NAME] = checkpoint_name
    _PIPELINE.last_checkpoint[TIMESTAMP] = now

    # the history keeps a snapshot (copy) of last_checkpoint for each checkpoint
    _PIPELINE.checkpoints.append(_PIPELINE.last_checkpoint.copy())

    # one row per checkpoint
    history = pd.DataFrame(_PIPELINE.checkpoints)

    # replace missing values with '' so PyTables doesn't pickle object types
    for column in history.columns:
        history[column] = history[column].fillna('')

    # the stored history is overwritten wholesale (no way to simply extend)
    write_df(history, CHECKPOINT_TABLE_NAME)
Example #2
0
def rewrap(table_name, df=None):
    """
    Register (or re-register) table_name with orca as a plain DataFrame-backed table.

    When df is None the dataframe is taken from the orca table of that name via
    to_frame(); the table must then already be registered, otherwise the assertion
    below fails.

    If an orca table of that name exists, its cache is cleared and the table and all
    columns listed for it are removed from orca's registries before df is added back
    under the same name.

    Parameters
    ----------
    table_name
        name of the orca table to add or replace
    df
        dataframe to register, or None to use the current contents of the orca table

    Returns
    -------
        the df that was registered
    """

    take_from_orca = df is None

    logger.debug("rewrap table %s inplace=%s" % (table_name, take_from_orca))

    if orca.is_table(table_name):

        # the raw lookup avoids triggering the function call of a TableFuncWrapper
        wrapper = orca.get_table(table_name) if take_from_orca \
            else orca.get_raw_table(table_name)

        if take_from_orca:
            df = wrapper.to_frame()

        wrapper.clear_cached()

        # fixme - reaches into orca's private registries
        for column_name in orca.list_columns_for_table(table_name):
            orca._COLUMNS.pop((table_name, column_name), None)

        # drop the table itself from orca's table list
        orca._TABLES.pop(table_name, None)

    assert df is not None

    orca.add_table(table_name, df)

    return df
Example #3
0
def extend_table(table_name, df, axis=0):
    """
    Add a new table, or extend an existing table with the rows or columns of df

    If no orca table called table_name exists, df simply becomes the table.
    Otherwise df is concatenated onto the existing table:

    - axis=0 appends rows; the indexes of the existing table and df must not overlap.
      Object-dtype columns of the existing table that are absent from df have their
      missing values filled with '' after the concat.
    - axis=1 appends columns; the indexes must be equal, and only the columns of df
      that the existing table does not already have are added.

    Parameters
    ----------
    table_name : str
        orca/inject table name
    df : pandas DataFrame
    axis : int
        0 to add rows, 1 to add columns

    Returns
    -------
    pandas DataFrame
        the dataframe passed on to replace_table
    """

    assert is_open(), f"Pipeline is not open."

    assert axis in (0, 1)

    if not orca.is_table(table_name):
        # nothing to extend - df is the whole table
        replace_table(table_name, df)
        return df

    existing = orca.get_table(table_name).to_frame()

    if axis == 0:
        # don't expect indexes to overlap
        assert len(existing.index.intersection(df.index)) == 0

        # str (object) columns of the existing table that df does not supply
        str_columns_to_backfill = [
            col for col in existing.columns
            if col not in df.columns and existing[col].dtype == 'O'
        ]

        # existing table first so that its column order is preserved
        df = pd.concat([existing, df], sort=False, axis=axis)

        for col in str_columns_to_backfill:
            df[col] = df[col].fillna('')
    else:
        # expect indexes be same
        assert existing.index.equals(df.index)

        # only bring over columns the existing table lacks
        df = df[[col for col in df.columns if col not in existing.columns]]

        # existing table first so that its column order is preserved
        df = pd.concat([existing, df], sort=False, axis=axis)

    replace_table(table_name, df)

    return df
Example #4
0
def get_table(table_name, checkpoint_name=None):
    """
    Return pandas dataframe corresponding to table_name

    if checkpoint_name is None, return the current (most recent) version of the table.
    The table can be a checkpointed table or any registered orca table (e.g. function table)

    if checkpoint_name is specified, return table as it was at that checkpoint
    (the most recently checkpointed version of the table at or before checkpoint_name)

    Parameters
    ----------
    table_name : str
    checkpoint_name : str or None

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    RuntimeError
        if checkpoint_name is given for an orca table that was never checkpointed,
        if the current version is requested for a table that was never checkpointed
        or was dropped, if checkpoint_name is not in the checkpoint history, or if
        the table is not part of the requested checkpoint
    """

    assert is_open(), "Pipeline is not open."

    # orca table not in checkpoints (e.g. a merged table)
    if table_name not in _PIPELINE.last_checkpoint and orca.is_table(table_name):
        if checkpoint_name is not None:
            # trailing space matters: the two literals are concatenated by the parser
            raise RuntimeError(
                "get_table: checkpoint_name ('%s') not supported "
                "for non-checkpointed table '%s'" %
                (checkpoint_name, table_name))

        return orca.get_table(table_name).to_frame()

    # if they want current version of table, no need to read from pipeline store
    if checkpoint_name is None:

        if table_name not in _PIPELINE.last_checkpoint:
            raise RuntimeError("table '%s' never checkpointed." % table_name)

        # an empty entry in last_checkpoint is reported as a dropped table
        if not _PIPELINE.last_checkpoint[table_name]:
            raise RuntimeError("table '%s' was dropped." % table_name)

        return orca.get_table(table_name).to_frame()

    # find the requested checkpoint
    # NOTE(review): history rows are looked up by the literal 'checkpoint_name' key;
    # confirm this matches the key used when checkpoints are recorded
    checkpoint = \
        next((x for x in _PIPELINE.checkpoints if x['checkpoint_name'] == checkpoint_name), None)
    if checkpoint is None:
        raise RuntimeError("checkpoint '%s' not in checkpoints." %
                           checkpoint_name)

    # find the checkpoint that table was written to store
    last_checkpoint_name = checkpoint.get(table_name, None)

    if not last_checkpoint_name:
        raise RuntimeError("table '%s' not in checkpoint '%s'." %
                           (table_name, checkpoint_name))

    # if this version of table is same as current, skip the read from the store
    if _PIPELINE.last_checkpoint.get(table_name, None) == last_checkpoint_name:
        return orca.get_table(table_name).to_frame()

    return read_df(table_name, last_checkpoint_name)
Example #5
0
def get_table(name, default=_NO_DEFAULT):
    """
    Return the orca table registered as name, falling back to default if there is none.

    Parameters
    ----------
    name : str
        orca table name
    default : object, optional
        value to return when no table called name is registered. When omitted, the
        lookup is always delegated to orca.get_table, so whatever orca does for an
        unregistered name applies.

    Returns
    -------
    the result of orca.get_table(name), or default
    """

    # Compare the sentinel by identity: with `==`, a DataFrame/Series/ndarray default
    # is compared elementwise and the result has no unambiguous truth value, so the
    # `or` would raise ValueError instead of returning the default.
    if orca.is_table(name) or default is _NO_DEFAULT:
        return orca.get_table(name)

    return default