def test_multiple_stattypes():
    '''Test cgpm statistical types are heuristically converted to Loom types.'''
    cctypes, distargs = cu.parse_distargs([
        'normal',
        'poisson',
        'bernoulli',
        'categorical(k=4)',
        'lognormal',
        'exponential',
        'beta',
        'geometric',
        'vonmises',
    ])
    D, Zv, Zc = tu.gen_data_table(
        200, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(10))
    engine = Engine(
        D.T,
        cctypes=cctypes,
        distargs=distargs,
        rng=gu.gen_rng(15),
        num_states=16,
    )
    # Loom transitions should improve the logpdf score on average.
    score_before = engine.logpdf_score()
    engine.transition_loom(N=5)
    score_after = engine.logpdf_score()
    assert numpy.mean(score_after) > numpy.mean(score_before)
    # Check serialization round-trip through the factory metadata.
    metadata = engine.to_metadata()
    factory_module = importlib.import_module(metadata['factory'][0])
    builder = getattr(factory_module, metadata['factory'][1])
    engine2 = builder.from_metadata(metadata)
    # Round-trip again through a JSON encoding.
    json_metadata = json.dumps(engine.to_metadata())
    engine3 = builder.from_metadata(json.loads(json_metadata))
    # Assert all states in engine, engine2, and engine3 have same loom_path.
    loom_paths = [
        state._loom_path
        for e in (engine, engine2, engine3)
        for state in e.states
    ]
    assert all(path == loom_paths[0] for path in loom_paths)
    engine2.transition(S=5)
    dependence_probability = engine2.dependence_probability_pairwise()
    assert numpy.all(dependence_probability > 0.85)
def engine():
    """Build a small engine over mixed stattypes and return its metadata."""
    # Set up the data generation.
    cctypes, distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])
    T, Zv, Zc = tu.gen_data_table(
        20, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(0))
    T = T.T
    # Make some nan cells for constraints.
    for col in (0, 1, 2, 3):
        T[5, col] = np.nan
    T[8, 4] = np.nan
    built = Engine(
        T,
        cctypes=cctypes,
        distargs=distargs,
        num_states=6,
        rng=gu.gen_rng(0),
    )
    built.transition(N=2)
    return built.to_metadata()
def test_incorporate_engine():
    """Incorporate a dim with a non-contiguous output, then serialize."""
    engine = Engine(
        T[:, :2],
        cctypes=CCTYPES[:2],
        distargs=DISTARGS[:2],
        num_states=4,
        rng=gu.gen_rng(0),
    )
    engine.transition(N=5)
    # Incorporate a new dim into with a non-contiguous output.
    engine.incorporate_dim(
        T[:, 2],
        outputs=[10],
        cctype=CCTYPES[2],
        distargs=DISTARGS[2],
    )
    engine.transition(N=2)
    # Serialize the engine, and run a targeted transition on variable 10.
    metadata = engine.to_metadata()
    engine2 = Engine.from_metadata(metadata)
    engine2.transition(N=2, cols=[10], multiprocess=0)
    # NOTE(review): this asserts on `engine`, not the deserialized `engine2`;
    # the outputs should agree either way — confirm which was intended.
    assert all(state.outputs == [0, 1, 10] for state in engine.states)
class Hierarchical_TRCRP_Mixture(object):
    """Hierarchical Temporally-Reweighted Chinese Restaurant Process Mixture.

    The data frame being modeled has an integer-valued index indicating the
    discrete time step, and has one column per time-varying variable, as
    shown below:

    +------+----------+----------+----------+
    | Time |  Var A   |  Var B   |  Var C   |
    +======+==========+==========+==========+
    | 1997 |   0.62   |   0.38   |   1.34   |
    +------+----------+----------+----------+
    | 1998 |   0.82   |   0.23   |   nan    |
    +------+----------+----------+----------+
    | 1999 |   nan    |   0.13   |   2.19   |
    +------+----------+----------+----------+
    | 2000 |   1.62   |   0.22   |   1.70   |
    +------+----------+----------+----------+
    | 2001 |   0.78   |   2.89   |   nan    |
    +------+----------+----------+----------+

    Parameters
    ----------
    chains : int
        Number of parallel MCMC chains to use for inference.
    lag : int
        Number of time points in the history to use for reweighting the
        CRP. If lag is zero, then all temporal dependencies are removed
        and the model becomes a standard CRP mixture.
    variables : list of str
        Human-readable names of the time series to be modeled.
    rng : numpy.random.RandomState
        Source of entropy.
    dependencies : list of tuple<string>, optional
        Blocks of variables which are deterministically constrained to be
        modeled jointly. Defaults to no deterministic constraints.
    """

    def __init__(self, chains, lag, variables, rng, dependencies=None):
        """Initialize a Hierarchical TRCRP Mixture instance."""
        # From constructor.
        self.chains = chains
        self.lag = lag
        self.variables = list(variables)
        self.rng = rng
        self.dependencies = dependencies
        # Derived attributes. Each variable is expanded into `window` cgpm
        # columns, one per lag, named '<var>.lag.<k>' for k = lag .. 0.
        self.window = self.lag + 1
        self.variables_lagged = list(itertools.chain.from_iterable([
            ['%s.lag.%d' % (varname, i,) for i in xrange(self.lag, -1, -1)]
            for varname in self.variables
        ]))
        self.variable_index = {var: i for i, var in enumerate(self.variables)}
        # Sanity check: the lag-0 column of each variable sits at the index
        # computed by _variable_to_index.
        for variable in self.variables:
            variable_idx = self._variable_to_index(variable)
            assert self.variables_lagged[variable_idx] == '%s.lag.0' % (
                variable,)
        # Internal attributes.
        # dataset  : pd.DataFrame of raw (unlagged) observations, one row
        #            per timepoint.
        # engine   : cgpm Engine over the lagged representation, created on
        #            the first call to incorporate.
        # initialized : flag set once the engine has been constructed.
        self.dataset = pd.DataFrame()
        self.engine = None
        self.initialized = None

    def incorporate(self, frame):
        """Incorporate new observations.

        Parameters
        ----------
        frame : pd.DataFrame
            DataFrame containing new observations. The columns must match
            `self.variables`.
        """
        assert set(frame.columns) == set(self.variables)
        self._incorporate_new_timepoints(frame)
        # XXX Improve this function.
        self._incorporate_existing_timepoints(frame)
        # Every row of the raw dataset must be mirrored by a cgpm row.
        assert self.engine.states[0].n_rows() == len(self.dataset)

    def resample_all(self, steps=None, seconds=None):
        """Run MCMC inference on entire latent state.

        Parameters
        ----------
        steps : int, optional
            Number of full Gibbs sweeps through all kernels, default is 1.
        seconds : int, optional
            Maximum number of seconds to run inference steps before timing
            out, default is None.

        Notes
        -----
        If both `steps` and `seconds` are specified, then the min is taken.
        That is, inference will run until the given number Gibbs steps are
        taken, or until the given number of seconds elapse, whichever comes
        first.
        """
        self._transition(N=steps, S=seconds, backend='lovecat')

    def resample_hyperparameters(self, steps=None, seconds=None,
            variables=None):
        """Run empirical Bayes on variable hyperparameters.

        Parameters
        ----------
        steps : int, optional
            Number of full Gibbs sweeps through all kernels, default is 1.
        seconds : int, optional
            Maximum number of seconds to run inference before timing out,
            default is None.
        variables : list of str
            List of time series variables whose hyperparameters to target,
            default is all.

        See Also
        --------
        resample_all
        """
        variables_transition = variables or self.variables
        # Target every lagged cgpm column of each requested variable.
        variable_indexes = list(itertools.chain.from_iterable([
            self._variable_to_window_indexes(v) for v in variables_transition
        ]))
        self._transition(N=steps, S=seconds, cols=variable_indexes,
            kernels=['view_alphas', 'column_hypers'], backend='cgpm')

    def simulate(self, timepoints, variables, nsamples, multiprocess=1):
        """Generate simulations from the posterior distribution.

        Parameters
        ----------
        timepoints : list of int
            List of integer-valued time steps to simulate
        variables : list of str
            Names of time series which to simulate from.
        nsamples : int
            Number of predictive samples to generate from each chain.

        Returns
        -------
        numpy.ndarray
            3D array of generated samples. The dimensions of the returned
            list are `(self.chains*nsamples, len(timepoints),
            len(variables))`, so that `result[i][j][k]` contains a
            simulation of `variables[k]`, at timepoint `j`, from chain `i`.
            A dissection of the output is shown below:

            .. code-block:: text

                # model has 2 chains, so chains * nsamples = 6 samples returned.
                >> model.simulate([1, 4], ['a', 'b'], 3)
                |<-----------chain 0----------->||<-----------chain 1----------->|
                [sample0.0, sample0.1, sample0.2, sample1.0, sample1.1, sample1.2]

                sample0.0: ((sim0.0_a1, sim0.0_b1), (sim0.0_a40, sim0.0_b40))
                sample0.1: ((sim0.1_a1, sim0.1_b1), (sim0.1_a40, sim0.1_b40))
                sample0.2: ((sim0.2_a1, sim0.2_b1), (sim0.2_a40, sim0.2_b40))
                sample1.0: ((sim1.0_a1, sim1.0_b1), (sim1.0_a40, sim1.0_b40))
                sample1.1: ((sim1.1_a1, sim1.1_b1), (sim1.1_a40, sim1.1_b40))
                sample1.2: ((sim1.2_a1, sim1.2_b1), (sim1.2_a40, sim1.2_b40))
        """
        cgpm_rowids = [self._timepoint_to_rowid(t) for t in timepoints]
        # Unobserved timepoints are constrained by the lagged values of
        # existing observations in their window.
        constraints_list = [self._get_cgpm_constraints(t) for t in timepoints]
        targets = [self._variable_to_index(var) for var in variables]
        targets_list = [targets] * len(cgpm_rowids)
        Ns = [nsamples] * len(cgpm_rowids)
        samples_raw_bulk = self.engine.simulate_bulk(cgpm_rowids,
            targets_list, constraints_list, Ns=Ns, multiprocess=multiprocess)
        # Transpose from rowid-major to sample-major within each chain, then
        # flatten across chains.
        samples_raw = list(itertools.chain.from_iterable(
            zip(*sample) for sample in samples_raw_bulk))
        samples = np.asarray([
            [[sample[t] for t in targets] for sample in sample_chain]
            for sample_chain in samples_raw
        ])
        return samples

    def simulate_ancestral(self, timepoints, variables, nsamples,
            multiprocess=1):
        """Generate simulations from the posterior distribution ancestrally.

        See Also
        --------
        simulate
        """
        assert timepoints == sorted(timepoints)
        targets = [self._variable_to_index(var) for var in variables]
        rowids = [self._timepoint_to_rowid(t) for t in timepoints]
        constraints = [self._get_cgpm_constraints(t) for t in timepoints]
        # For each timepoint, the set of timepoints covered by its window;
        # used to derive the ancestral (parent) structure.
        windows = {
            timepoint: set(self._get_timepoint_window(timepoint))
            for timepoint in timepoints
        }
        parents = {
            timepoint: self._get_parents_from_windows(timepoint, windows)
            for timepoint in timepoints
        }
        args = [
            (state, timepoints, variables, rowids, targets, constraints,
                parents, self._variable_to_index, nsamples)
            for state in self.engine.states
        ]
        mapper = parallel_map if multiprocess else map
        self.engine._seed_states()
        samples_raw_list = mapper(_simulate_ancestral_mp, args)
        samples_raw = itertools.chain.from_iterable(samples_raw_list)
        samples = np.asarray([
            [[sample[t][variable] for variable in targets]
                for t in timepoints]
            for sample in samples_raw
        ])
        return samples

    def dependence_probability_pairwise(self, variables=None):
        """Compute posterior dependence probabilities between time series.

        Parameters
        ----------
        variables : list of str, optional
            List of time series variables to include in the returned array.
            Defaults to `self.variables`.

        Returns
        -------
        numpy.ndarray
            3D array containing pairwise dependence probabilities of time
            series `variables` from each chain. The dimensions of the
            returned array are `(self.chains, len(variables),
            len(variables))`, so that `result[i,j,k] == 1` if
            `variables[j]` and `variables[k]` are dependent according to
            chain `i`, and `0` otherwise.
        """
        if variables is None:
            variables = self.variables
        varnos = [self._variable_to_index(var) for var in variables]
        D = self.engine.dependence_probability_pairwise(colnos=varnos)
        return np.asarray(D)

    def get_temporal_regimes(self, variable, timepoints=None):
        """Return latent temporal regime at `timepoints` of the given `variable`.

        Parameters
        ----------
        variable : str
            Name of the time series variable to query.
        timepoints : list of int, optional
            List of timepoints at which to get the latent temporal regime
            value, defaults to all observed timepoints.

        Returns
        -------
        numpy.ndarray
            2D array containing latent temporal regime at `timepoints` of
            the given variable, for each chain. The dimensions of the
            returned array are `(self.chains, len(timepoints))`, where
            `result[i][t]` is the value of the hidden temporal regime at
            `timepoints[t]`, according to chain `i`.

            *Note*: The actual integer values of the regimes are
            immaterial.
        """
        if timepoints is None:
            timepoints = self.dataset.index
        rowids = [self._timepoint_to_rowid(t) for t in timepoints]
        varno = self._variable_to_index(variable)
        # The regime of a timepoint is its cluster assignment in the view
        # containing the variable's lag-0 column.
        regimes = [[state.view_for(varno).Zr(rowid) for rowid in rowids]
            for state in self.engine.states]
        return np.asarray(regimes)

    def _transition(self, **kwargs):
        """Helper for MCMC resample methods (full interface not exposed)."""
        if self.engine is None:
            raise ValueError('No data incorporate yet.')
        backend = kwargs.pop('backend', None)
        # Default to transitioning only the lag-0 columns.
        kwargs['cols'] = kwargs.pop('cols', self._variable_indexes())
        if backend in ['cgpm', None]:
            self.engine.transition(**kwargs)
        elif backend in ['lovecat']:
            self.engine.transition_lovecat(**kwargs)
        elif backend in ['loom']:
            self.engine.transition_loom(**kwargs)
        else:
            raise ValueError('Unknown backend: %s' % (backend,))

    def _incorporate_new_timepoints(self, frame):
        """Incorporate fresh sample ids as new cgpm rows."""
        new_timepoints = frame.index[~frame.index.isin(self.dataset.index)]
        new_observations = frame[self.variables].loc[new_timepoints]
        self.dataset = self.dataset.append(new_observations)
        # Convert each new timepoint to its lagged row representation.
        new_rows = [self._get_timepoint_row(t) for t in new_timepoints]
        if self.initialized:
            outputs = self.engine.states[0].outputs
            assert all(len(row) == len(outputs) for row in new_rows)
            rowids_cgpm = range(self.engine.states[0].n_rows(),
                self.engine.states[0].n_rows() + len(new_rows))
            # Only observed (non-nan) cells are incorporated.
            observations_cgpm = [
                {i: row[i] for i in outputs if not np.isnan(row[i])}
                for row in new_rows
            ]
            assert all(
                rowid_cgpm == self._timepoint_to_rowid(timepoint)
                for timepoint, rowid_cgpm in zip(new_timepoints, rowids_cgpm)
            )
            self.engine.incorporate_bulk(rowids_cgpm, observations_cgpm)
        # XXX Do not initialize here! Instead, consider including a dummy
        # row of all zeros or similar. The reason that we initialize with
        # the full training set is to ensure that we have a good initial
        # set of hyperparameter grids. Instead, we should consider
        # redefining the grids after incorporating new data (a slight
        # heuristic).
        else:
            self.engine = Engine(
                np.asarray(new_rows),
                num_states=self.chains,
                cctypes=['normal'] * len(self.variables_lagged),
                Cd=self._get_variable_dependence_constraints(),
                rng=self.rng,
            )
            self.initialized = True

    def _incorporate_existing_timepoints(self, frame):
        """Update existing timepoints with NaN entries in cgpm cells."""
        # Cells that are nan in the stored dataset but observed in frame.
        nan_mask = pd.isnull(self.dataset) & ~pd.isnull(frame)
        nan_mask = nan_mask[nan_mask.any(axis=1)]
        if len(nan_mask) == 0:
            return
        cgpm_rowids_cells = []
        # For each new timepoint, get the cgpm rowids and cell values to
        # force.
        for nan_timepoint, nan_timepoint_mask in nan_mask.iterrows():
            self._update_dataset_nan_timepoint(frame, nan_timepoint,
                nan_timepoint_mask)
            timepoint_rowids_cells = \
                self._convert_nan_timepoint_to_cgpm_rowid_cells(
                    frame, nan_timepoint, nan_timepoint_mask)
            cgpm_rowids_cells.extend(timepoint_rowids_cells)
        # Force the cells in bulk.
        cgpm_rowids, cgpm_cells = zip(*cgpm_rowids_cells)
        self.engine.force_cell_bulk(cgpm_rowids, cgpm_cells)
        # XXX Also force any other sample ids which may have the new sample
        # ids in the window set at nan. Refer to the test case in
        # tests/test_data_transforms.test_incorporate_sampleid_wedged.

    def _update_dataset_nan_timepoint(self, frame, nan_timepoint,
            nan_timepoint_mask):
        """Populates timepoint with nan values in self.dataset using frame."""
        nan_col_names = nan_timepoint_mask[nan_timepoint_mask].index
        nan_col_values = frame.loc[nan_timepoint, nan_col_names]
        self.dataset.loc[nan_timepoint, nan_col_names] = nan_col_values

    def _convert_nan_timepoint_to_cgpm_rowid_cells(self, frame, nan_timepoint,
            nan_timepoint_mask):
        """Returns the cgpm rowid of all windows that nan_timepoint
        participates in, and dict containing columns and values to
        populate."""
        nan_col_names = nan_timepoint_mask[nan_timepoint_mask].index
        nan_col_idxs = [self._variable_to_index(col) for col in nan_col_names]
        nan_col_values = frame.loc[nan_timepoint, nan_col_names].as_matrix()
        cgpm_rowids = self._timepoint_to_rowids(nan_timepoint)
        # In the row at lag `lag`, the same observation lands `lag` columns
        # earlier (col_idx - lag); rowids of None (unknown timepoints) are
        # skipped.
        cgpm_rowids_cells = [
            {col_idx - lag: value
                for col_idx, value in zip(nan_col_idxs, nan_col_values)}
            if rowid is not None else None
            for lag, rowid in enumerate(cgpm_rowids)
        ]
        return [
            (rowid, cells)
            for rowid, cells in zip(cgpm_rowids, cgpm_rowids_cells)
            if rowid is not None
        ]

    def _get_parents_from_windows(self, timepoint, windows):
        """Return list of timepoints of parents of the given timepoint."""
        return [
            timepoint2 for timepoint2 in windows
            if timepoint2 != timepoint and timepoint in windows[timepoint2]
        ]

    def _timepoint_to_rowid(self, timepoint):
        """Return the cgpm rowid representing the timepoint, or None if the
        timepoint has not been observed."""
        try:
            return self.dataset.index.get_loc(timepoint)
        except KeyError:
            return None

    def _timepoint_to_rowids(self, timepoint):
        """Return the list of cgpm rowids that timepoint participates in."""
        # Assuming self.window = 3, the first cgpm rowid that timepoint of
        # value 13 participates in is the rowid of timepoint, and the last
        # cgpm rowid is the rowid of timepoint+lag.
        # Example:
        #           lag     L2,L1,L0
        #           rowid=7 11,12,13
        #           rowid=8 12,13,14
        #           rowid=9 13,14,15
        timepoints_window = self._get_timepoint_window(timepoint)
        return [self._timepoint_to_rowid(t) for t in timepoints_window]

    def _get_timepoint_window(self, timepoint):
        """Return the timepoints whose windows contain this timepoint,
        i.e. timepoint .. timepoint+lag inclusive."""
        return range(timepoint, timepoint + self.window)

    def _variable_to_index(self, variable, lag=0):
        """Convert variable name to cgpm output index."""
        assert 0 <= lag <= self.lag
        return self.variable_index[variable] * self.window + (self.lag - lag)

    def _variable_to_window_indexes(self, variable):
        """Convert variable name to list of cgpm output indexes in its
        window."""
        return [self._variable_to_index(variable, l)
            for l in xrange(self.window)]

    def _variable_indexes(self):
        """Return list of cgpm output indexes, one per variable at lag 0."""
        return [self._variable_to_index(var) for var in self.variables]

    def _get_variable_dependence_constraints(self):
        """Ensure lagged columns and user constraints are modeled as a
        block."""
        cgpm_dependencies = self._make_dependencies(self.dependencies)
        dependencies = [
            list(itertools.chain.from_iterable(
                [self._variable_to_window_indexes(c) for c in block]))
            for block in cgpm_dependencies
        ]
        # Filter out any singleton dependencies.
        return [colnos for colnos in dependencies if len(colnos) > 1]

    def _get_cgpm_constraints(self, timepoint):
        """Return constraints dict (output index -> value) for simulating
        the given timepoint, or None if it is already incorporated."""
        # An already incorporated timepoint requires no constraints.
        if timepoint in self.dataset.index:
            return None
        # Retrieve existing observations in window of a fresh timepoint.
        row_values = self._get_timepoint_row(timepoint)
        assert len(row_values) == len(self.variables_lagged)
        # XXX Require user to specify columns to ignore.
        return {i: v for i, v in enumerate(row_values) if not np.isnan(v)}

    def _get_timepoint_row(self, timepoint):
        """Convert timepoint to row representation with timepoint at lag0."""
        timepoints_lag = range(timepoint - self.lag, timepoint + 1)
        # Missing timepoints in the window produce nan cells.
        return list(itertools.chain.from_iterable(
            (self.dataset[col].get(t, float('nan')) for t in timepoints_lag)
            for col in self.variables))

    def _make_dependencies(self, dependencies):
        """Return combination of default and user's dependence
        constraints."""
        if dependencies is None:
            dependencies = []
        seen = set(col for block in dependencies for col in block)
        # Variables not mentioned by the user each form their own block.
        deps_default = [[col] for col in self.variables if col not in seen]
        deps_external = [block for block in dependencies]
        # NOTE(review): the outer tuple() wraps an already-materialized
        # tuple, so it is a no-op copy and the inner blocks stay as lists —
        # presumably `tuple(tuple(b) for b in itertools.chain(...))` was
        # intended; confirm before changing.
        return tuple(tuple(itertools.chain(deps_default, deps_external)))

    def to_metadata(self):
        """Return a JSON representation that can be saved to disk.

        The typical usage pattern for serializing `model` and deserializing
        it into `model2` is:

        .. code-block:: python

            >> import importlib
            >> metadata = model.to_metadata()
            >> modname, attrname = metadata['factory']
            >> module = importlib.import_module(modname)
            >> klass = getattr(module, attrname)
            >> model2 = klass.from_metadata(metadata, seed)
        """
        metadata = dict()
        # From constructor.
        metadata['chains'] = self.chains
        metadata['lag'] = self.lag
        metadata['variables'] = self.variables
        metadata['dependencies'] = self.dependencies
        # Internal fields.
        metadata['initialized'] = self.initialized
        metadata['engine'] = self.engine.to_metadata() \
            if self.initialized else None
        metadata['dataset.values'] = self.dataset.values.tolist()
        metadata['dataset.index'] = list(self.dataset.index)
        metadata['dataset.columns'] = list(self.dataset.columns)
        # Factory.
        metadata['factory'] = ('trcrpm', 'Hierarchical_TRCRP_Mixture')
        return metadata

    @classmethod
    def from_metadata(cls, metadata, seed):
        """Load object from its JSON representation.

        Parameters
        ----------
        metadata : json blob
            JSON blob return from call to :meth:`to_metadata`.
        seed : int
            Seed for the random number generator to use.

        See Also
        --------
        to_metadata
        """
        model = cls(
            chains=metadata['chains'],
            lag=metadata['lag'],
            variables=metadata['variables'],
            rng=np.random.RandomState(seed),
            dependencies=metadata['dependencies'],
        )
        # Return model with populated internal fields.
        return model._populate_from_metadata(model, metadata)

    @staticmethod
    def _populate_from_metadata(model, metadata):
        # Restore the internal (non-constructor) state of `model` from the
        # metadata blob produced by to_metadata.
        model.initialized = metadata['initialized']
        model.dataset = pd.DataFrame(metadata['dataset.values'],
            index=metadata['dataset.index'],
            columns=metadata['dataset.columns'])
        model.engine = Engine.from_metadata(metadata['engine']) \
            if model.initialized else None
        return model