def add_checkpoint(checkpoint_name):
    """
    Record a new checkpoint called ``checkpoint_name``.

    Each registered table is written with ``write_df`` (and tagged with this
    checkpoint name) when any of the following holds:

    - orca reports columns attached to the table, in which case the table is
      first rewrapped as a plain DataFrame-backed table;
    - the table has never been checkpointed;
    - the table is listed in ``_PIPELINE.replaced_tables``.

    All other tables are skipped. Afterwards the set of replaced tables is
    cleared, the checkpoint is appended to the checkpoint history, and the
    whole history is rewritten under ``CHECKPOINT_TABLE_NAME``.

    NOTE(review): earlier documentation also mentioned saving the state of
    the random number generator; nothing in this function does so directly.

    Parameters
    ----------
    checkpoint_name : str
    """
    timestamp = dt.datetime.now()

    logger.debug("add_checkpoint %s timestamp %s" % (checkpoint_name, timestamp))

    for table_name in registered_tables():

        # FIXME - this won't detect if the orca table was modified
        has_attached_columns = len(orca.list_columns_for_table(table_name)) > 0

        if has_attached_columns:
            # collapse the table and its attached columns into a single
            # DataFrame-backed DataFrameWrapper table
            df = rewrap(table_name)
        elif (table_name in _PIPELINE.last_checkpoint
              and table_name not in _PIPELINE.replaced_tables):
            # already checkpointed and not replaced since: nothing to write
            continue
        else:
            df = orca.get_table(table_name).to_frame()

        size_info = util.df_size(df)
        logger.debug("add_checkpoint '%s' table '%s' %s" % (checkpoint_name, table_name, size_info))
        write_df(df, table_name, checkpoint_name)

        # note the checkpoint at which this table was most recently written
        _PIPELINE.last_checkpoint[table_name] = checkpoint_name

    _PIPELINE.replaced_tables.clear()

    _PIPELINE.last_checkpoint[CHECKPOINT_NAME] = checkpoint_name
    _PIPELINE.last_checkpoint[TIMESTAMP] = timestamp

    # the history keeps a snapshot (copy) of the table -> checkpoint mapping
    snapshot = _PIPELINE.last_checkpoint.copy()
    _PIPELINE.checkpoints.append(snapshot)

    # one row per checkpoint
    history = pd.DataFrame(_PIPELINE.checkpoints)

    # convert empty values to str so PyTables doesn't pickle object types
    for column in history.columns:
        history[column] = history[column].fillna('')

    # no way to simply extend the stored table, so overwrite the previous version
    write_df(history, CHECKPOINT_TABLE_NAME)
def rewrap(table_name, df=None):
    """
    Register (or re-register) ``table_name`` with orca as a plain table built
    from a single dataframe.

    When ``df`` is None the dataframe is taken from the currently registered
    orca table via ``to_frame()``, so ``table_name`` must already be
    registered in that case. When ``df`` is supplied, the raw table is fetched
    instead so that a function-backed table is not evaluated.

    If the table is already registered, its cache is cleared and both the
    table and every column orca lists for it are removed from orca's
    registries before the dataframe is added back.

    Parameters
    ----------
    table_name
    df

    Returns
    -------
    the dataframe that now backs the registered table
    """
    message = "rewrap table %s inplace=%s" % (table_name, (df is None))
    logger.debug(message)

    if orca.is_table(table_name):

        if df is None:
            registered = orca.get_table(table_name)
            df = registered.to_frame()
        else:
            # don't trigger function call of TableFuncWrapper
            registered = orca.get_raw_table(table_name)

        registered.clear_cached()

        # fixme - reaches into orca's private column registry
        for attached_column in orca.list_columns_for_table(table_name):
            column_key = (table_name, attached_column)
            orca._COLUMNS.pop(column_key, None)

        # drop the table itself from orca's table registry
        orca._TABLES.pop(table_name, None)

    assert df is not None

    orca.add_table(table_name, df)

    return df
def extend_table(table_name, df, axis=0):
    """
    Create a table from ``df``, or grow an existing table with it.

    If ``table_name`` is not yet registered, ``df`` becomes the table as-is.
    Otherwise ``df`` is concatenated onto the existing table:

    - ``axis=0`` appends rows; the two indexes must not overlap. Object-dtype
      columns of the existing table that ``df`` lacks have their missing
      values filled with ``''`` after concatenation.
    - ``axis=1`` appends columns; the two indexes must be equal, and only the
      columns of ``df`` not already in the table are added.

    Parameters
    ----------
    table_name : str
        orca/inject table name
    df : pandas DataFrame
    axis : int
        0 to add rows, 1 to add columns

    Returns
    -------
    pandas DataFrame
        the dataframe passed to ``replace_table``
    """
    assert is_open(), f"Pipeline is not open."
    assert axis in [0, 1]

    # brand new table: nothing to merge with
    if not orca.is_table(table_name):
        replace_table(table_name, df)
        return df

    existing = orca.get_table(table_name).to_frame()

    # only populated when adding rows
    backfill_columns = []

    if axis == 0:
        # don't expect indexes to overlap
        shared_index = existing.index.intersection(df.index)
        assert len(shared_index) == 0
        backfill_columns = [
            col for col in existing.columns
            if col not in df.columns and existing[col].dtype == 'O'
        ]
    else:
        # expect indexes be same
        assert existing.index.equals(df.index)
        added_columns = [col for col in df.columns if col not in existing.columns]
        df = df[added_columns]

    # existing table goes first so its column order is preserved
    combined = pd.concat([existing, df], sort=False, axis=axis)

    # new rows have no value for str (object) columns that df lacked
    for col in backfill_columns:
        combined[col] = combined[col].fillna('')

    replace_table(table_name, combined)

    return combined
def get_table(table_name, checkpoint_name=None):
    """
    Return the pandas dataframe corresponding to table_name.

    If checkpoint_name is None, return the current (most recent) version of
    the table. The table can be a checkpointed table or any registered orca
    table (e.g. a function table).

    If checkpoint_name is specified, return the table as recorded for that
    checkpoint (the version most recently written at or before
    checkpoint_name).

    Parameters
    ----------
    table_name : str
    checkpoint_name : str or None

    Returns
    -------
    df : pandas.DataFrame

    Raises
    ------
    RuntimeError
        If a checkpoint is requested for a table that was never checkpointed,
        if the table was never checkpointed or was dropped, if the checkpoint
        does not exist, or if the table is not part of that checkpoint.
    """
    assert is_open(), "Pipeline is not open."

    # orca table not in checkpoints (e.g. a merged table)
    if table_name not in _PIPELINE.last_checkpoint and orca.is_table(table_name):
        if checkpoint_name is not None:
            # the trailing space keeps the two adjacent literals from running
            # together ("...not supportedfor...")
            raise RuntimeError(
                "get_table: checkpoint_name ('%s') not supported "
                "for non-checkpointed table '%s'" % (checkpoint_name, table_name))

        return orca.get_table(table_name).to_frame()

    # if they want current version of table, no need to read from pipeline store
    if checkpoint_name is None:

        if table_name not in _PIPELINE.last_checkpoint:
            raise RuntimeError("table '%s' never checkpointed." % table_name)

        # an empty entry marks a table that is no longer available
        if not _PIPELINE.last_checkpoint[table_name]:
            raise RuntimeError("table '%s' was dropped." % table_name)

        return orca.get_table(table_name).to_frame()

    # find the requested checkpoint
    checkpoint = next(
        (x for x in _PIPELINE.checkpoints if x['checkpoint_name'] == checkpoint_name),
        None)
    if checkpoint is None:
        raise RuntimeError("checkpoint '%s' not in checkpoints." % checkpoint_name)

    # find the checkpoint at which the table was last written to the store
    last_checkpoint_name = checkpoint.get(table_name, None)

    if not last_checkpoint_name:
        raise RuntimeError("table '%s' not in checkpoint '%s'."
                           % (table_name, checkpoint_name))

    # if this version of the table is the same as the current one, the
    # registered orca table can be used directly
    if _PIPELINE.last_checkpoint.get(table_name, None) == last_checkpoint_name:
        return orca.get_table(table_name).to_frame()

    return read_df(table_name, last_checkpoint_name)
def get_table(name, default=_NO_DEFAULT):
    """
    Return the orca table registered as ``name``, or ``default`` if absent.

    Parameters
    ----------
    name : str
        orca table name
    default : object, optional
        Value returned when no table called ``name`` is registered. When
        omitted, the lookup is delegated to ``orca.get_table`` even for an
        unregistered name, so orca's own error handling applies.

    Returns
    -------
    the result of ``orca.get_table(name)``, or ``default``
    """
    # Identity, not equality: `default == _NO_DEFAULT` would call the default's
    # own __eq__, and an array-like default (e.g. a DataFrame) returns an
    # element-wise result that cannot be used as an `if` condition.
    if orca.is_table(name) or default is _NO_DEFAULT:
        return orca.get_table(name)

    return default