def _incorporate_new_timepoints(self, frame):
    """Incorporate fresh sample ids as new cgpm rows."""
    # Timepoints present in `frame` but not yet in the tracked dataset.
    new_timepoints = frame.index[~frame.index.isin(self.dataset.index)]
    new_observations = frame[self.variables].loc[new_timepoints]
    # NOTE(review): DataFrame.append is removed in pandas >= 2.0 (use
    # pd.concat); left as-is to preserve behavior on this codebase.
    self.dataset = self.dataset.append(new_observations)
    # One lagged row (indexable by cgpm output index) per new timepoint.
    new_rows = [self._get_timepoint_row(t) for t in new_timepoints]
    if self.initialized:
        outputs = self.engine.states[0].outputs
        assert all(len(row) == len(outputs) for row in new_rows)
        # Assign consecutive cgpm rowids starting after the existing rows.
        rowids_cgpm = range(self.engine.states[0].n_rows(),
            self.engine.states[0].n_rows() + len(new_rows))
        # Only observed (non-nan) cells are incorporated.
        observations_cgpm = [{
            i: row[i] for i in outputs if not np.isnan(row[i])
        } for row in new_rows]
        # Sanity check: assigned rowids agree with _timepoint_to_rowid.
        assert all(
            rowid_cgpm == self._timepoint_to_rowid(timepoint)
            for timepoint, rowid_cgpm in zip(new_timepoints, rowids_cgpm))
        self.engine.incorporate_bulk(rowids_cgpm, observations_cgpm)
    # XXX Do not initialize here! Instead, consider including a dummy row of
    # all zeros or similar. The reason that we initialize with the full
    # training set is to ensure that we have a good initial set of
    # hyperparameter grids. Instead, we should consider redefining the grids
    # after incorporating new data (a slight heuristic).
    else:
        self.engine = Engine(
            np.asarray(new_rows),
            num_states=self.chains,
            cctypes=['normal'] * len(self.variables_lagged),
            Cd=self._get_variable_dependence_constraints(),
            rng=self.rng,
        )
        self.initialized = True
def _incorporate_new_timepoints(self, frame):
    """Incorporate fresh sample ids as new cgpm rows."""
    # Timepoints present in `frame` but not yet in the tracked dataset.
    new_timepoints = frame.index[~frame.index.isin(self.dataset.index)]
    new_observations = frame[self.variables].loc[new_timepoints]
    # NOTE(review): DataFrame.append is removed in pandas >= 2.0 (use
    # pd.concat); left as-is to preserve behavior on this codebase.
    self.dataset = self.dataset.append(new_observations)
    new_rows = [self._get_timepoint_row(t) for t in new_timepoints]
    if self.initialized:
        outputs = self.engine.states[0].outputs
        # Incorporate one row at a time; each new row takes the next
        # available cgpm rowid, which must match _timepoint_to_rowid.
        for row, timepoint in zip(new_rows, new_timepoints):
            rowid_cgpm = self.engine.states[0].n_rows()
            assert len(row) == len(outputs)
            assert rowid_cgpm == self._timepoint_to_rowid(timepoint)
            # Only observed (non-nan) cells are incorporated.
            row_cgpm = {i: row[i] for i in outputs if not np.isnan(row[i])}
            self.engine.incorporate(rowid_cgpm, row_cgpm)
    # XXX Do not initialize here! Instead, consider including a dummy row of
    # all zeros or something. The reason that we initialize with the full
    # training set is to ensure that we have a good initial set of
    # hyperparameter grids.
    else:
        self.engine = Engine(
            np.asarray(new_rows),
            num_states=self.chains,
            cctypes=['normal'] * len(self.variables_lagged),
            Cd=self._get_variable_dependence_constraints(),
            rng=self.rng,
        )
        self.initialized = True
def test_engine_simulate_no_repeat():
    """Generate 3 samples from 2 states 10 times, and ensure uniqueness."""
    rng = gu.gen_rng(1)
    engine = Engine(X=[[1]], cctypes=['normal'], num_states=2, rng=rng)
    # Ten independent draws of N=3 samples of variable 0 from state 0.
    samples_list = [[
        sample[0] for sample in engine.simulate(None, [0], N=3)[0]
    ] for _i in xrange(10)]
    # If any two draws were identical, their frozensets would collide and
    # the set would shrink.
    samples_set = set([frozenset(s) for s in samples_list])
    assert len(samples_set) == len(samples_list)
def compare_dependence_heatmap():
    """Plot side-by-side dependence heatmaps of the two pickled engines."""
    e1 = Engine.from_pickle('resources/animals/animals.engine')
    e2 = Engine.from_pickle('resources/animals/animals-lovecat.engine')
    D1 = e1.dependence_probability_pairwise()
    D2 = e2.dependence_probability_pairwise()
    # Cluster D1 once and reuse its row ordering for both heatmaps so the
    # two engines are visually comparable.
    C1 = pu.plot_clustermap(D1)
    ordering = C1.dendrogram_row.reordered_ind
    fig, ax = plt.subplots(nrows=1, ncols=2)
    pu.plot_heatmap(D1, xordering=ordering, yordering=ordering, ax=ax[0])
    pu.plot_heatmap(D2, xordering=ordering, yordering=ordering, ax=ax[1])
def test_two_views_column_partition_normal__ci_(lovecat):
    """Run (lovecat or cgpm) inference on a 2-view normal dataset and plot
    pairwise dependence/row-similarity clustermaps for inspection."""
    D = retrieve_normal_dataset()
    engine = Engine(D.T,
        outputs=[5, 0, 1, 2, 3, 4],
        cctypes=['normal'] * len(D),
        rng=gu.gen_rng(12),
        num_states=64)
    if lovecat:
        engine.transition_lovecat(N=200)
    else:
        engine.transition(N=200)
    P = engine.dependence_probability_pairwise()
    R1 = engine.row_similarity_pairwise(cols=[5, 0, 1])
    R2 = engine.row_similarity_pairwise(cols=[2, 3, 4])
    pu.plot_clustermap(P)
    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)
    # Expected block structure of P: two views of three columns each.
    # NOTE(review): P_THEORY is not asserted against P -- the comparison
    # appears to be by visual inspection of the clustermaps.
    P_THEORY = [
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
    ]
    return engine
def test_logpdf_score_crash(): rng = gen_rng(8) # T = rng.choice([0,1], p=[.3,.7], size=250).reshape(-1,1) T = rng.normal(size=30).reshape(-1, 1) engine = Engine(T, cctypes=['normal'], rng=rng, num_states=4) logpdf_likelihood_initial = np.array(engine.logpdf_likelihood()) logpdf_score_initial = np.array(engine.logpdf_score()) assert np.all(logpdf_score_initial < logpdf_likelihood_initial) # assert np.all(logpdf_likelihood_initial < logpdf_score_initial) engine.transition(N=100) engine.transition(kernels=['column_hypers', 'view_alphas'], N=10) logpdf_likelihood_final = np.asarray(engine.logpdf_likelihood()) logpdf_score_final = np.asarray(engine.logpdf_score()) assert np.all(logpdf_score_final < logpdf_likelihood_final) assert np.max(logpdf_score_initial) < np.max(logpdf_score_final)
def _engine(self, bdb, generator_id):
    """Return the Engine for `generator_id`, loading and caching it.

    Raises BQLError if no models have been initialized for the generator.
    """
    # Probe the cache.
    cache = self._cache(bdb)
    if cache is not None and generator_id in cache.engine:
        return cache.engine[generator_id]
    # Not cached. Load the engine from the database.
    cursor = bdb.sql_execute(
        '''
        SELECT engine_json FROM bayesdb_cgpm_generator
            WHERE generator_id = ?
    ''', (generator_id, ))
    engine_json = cursor_value(cursor)
    if engine_json is None:
        generator = core.bayesdb_generator_name(bdb, generator_id)
        raise BQLError(
            bdb, 'No models initialized for generator: %r' % (generator, ))
    # Deserialize the engine.
    engine = Engine.from_metadata(json.loads(engine_json),
        rng=bdb.np_prng,
        multiprocess=self._ncpu)
    # Cache it, if we can.
    if cache is not None:
        cache.engine[generator_id] = engine
    return engine
def test_logpdf_bulk__ci_(engine):
    """Check logpdf_bulk output dimensions, with and without `statenos`."""
    engine = Engine.from_metadata(engine)
    rowid1, targets1, constraints1 = 5, {0: 0}, {2: 1, 3: .5}
    rowid2, targets2, constraints2 = -1, {1: 0, 4: .8}, {5: .5}
    # Bulk.
    rowids = [rowid1, rowid2]
    targets_list = [targets1, targets2]
    constraints_list = [constraints1, constraints2]
    def test_correct_dimensions(statenos):
        # Invoke
        logpdfs = engine.logpdf_bulk(rowids,
            targets_list,
            constraints_list=constraints_list,
            statenos=statenos)
        # BUG FIX: the original `assert a == b if cond else n` parsed as
        # `(a == b) if cond else n`, so for statenos != None the assert
        # merely evaluated the truthy len(statenos) and always passed.
        # Parenthesize the conditional (cf. test_simulate_bulk__ci_).
        assert len(logpdfs) == \
            (engine.num_states() if statenos is None else len(statenos))
        for state_logpdfs in logpdfs:
            # state_logpdfs should be a list of floats, one float per targets.
            assert len(state_logpdfs) == len(rowids)
            for l in state_logpdfs:
                assert isinstance(l, float)
    test_correct_dimensions(statenos=None)
    test_correct_dimensions(statenos=[0, 1, 4, 5])
def engine():
    """Fixture: 4-state engine over 20 rows of 9 mixed-type columns."""
    # Set up the data generation
    cctypes, distargs = cu.parse_distargs([
        'normal',
        'poisson',
        'bernoulli',
        'categorical(k=4)',
        'lognormal',
        'exponential',
        'beta',
        'geometric',
        'vonmises',
    ])
    T, Zv, Zc = tu.gen_data_table(20, [1], [[.25, .25, .5]], cctypes,
        distargs, [.95] * len(cctypes), rng=gu.gen_rng(10))
    # Single-process mode keeps the fixture deterministic and lightweight.
    return Engine(T.T,
        cctypes=cctypes,
        distargs=distargs,
        num_states=4,
        rng=gu.gen_rng(312),
        multiprocess=False)
def test_simulate_bulk__ci_(engine):
    """Check simulate_bulk output dimensions, with and without `statenos`."""
    engine = Engine.from_metadata(engine)
    rowid1, targets1, constraints1, N1, = -1, [0, 2, 4, 5], {3: 1}, 7
    rowid2, targets2, constraints2, N2 = 5, [1, 3], {2: 1}, 3
    rowid3, targets3, constraints3, N3 = 8, [0], {4: .8}, 3
    # Bulk.
    rowids = [rowid1, rowid2, rowid3]
    targets_list = [targets1, targets2, targets3]
    constraints_list = [constraints1, constraints2, constraints3]
    Ns = [N1, N2, N3]
    def test_correct_dimensions(statenos):
        # Invoke
        samples = engine.simulate_bulk(rowids,
            targets_list,
            constraints_list=constraints_list,
            Ns=Ns,
            statenos=statenos)
        # One entry per requested state (all states when statenos is None).
        assert len(samples) == (engine.num_states()
            if statenos is None else len(statenos))
        for states_samples in samples:
            assert len(states_samples) == len(rowids)
            for i, sample in enumerate(states_samples):
                # Ns[i] draws per rowid, each covering exactly targets[i].
                assert len(sample) == Ns[i]
                for s in sample:
                    assert set(s.keys()) == set(targets_list[i])
                    assert len(s) == len(targets_list[i])
    test_correct_dimensions(None)
    test_correct_dimensions([4])
def test_dependence_probability__ci_(engine):
    """Dependence probability returns one value per requested state."""
    engine = Engine.from_metadata(engine)
    # Default: all states are queried.
    full = engine.dependence_probability(0, 2, statenos=None)
    assert len(full) == engine.num_states()
    # An explicit subset restricts the result accordingly.
    chosen = [1, 4]
    partial = engine.dependence_probability(0, 2, statenos=chosen)
    assert len(partial) == len(chosen)
def _populate_from_metadata(model, metadata):
    """Restore the internal fields of `model` from a metadata dict and
    return the populated model."""
    model.initialized = metadata['initialized']
    # Rebuild the tracked dataset exactly as serialized.
    model.dataset = pd.DataFrame(
        metadata['dataset.values'],
        index=metadata['dataset.index'],
        columns=metadata['dataset.columns'],
    )
    # The engine only exists once the model has been initialized.
    if model.initialized:
        model.engine = Engine.from_metadata(metadata['engine'])
    else:
        model.engine = None
    return model
def test_row_similarity__ci_(engine):
    """Row similarity returns one value per requested state."""
    engine = Engine.from_metadata(engine)
    # Default: all states are queried.
    full = engine.row_similarity(0, 2, statenos=None)
    assert len(full) == engine.num_states()
    # An explicit subset restricts the result accordingly.
    chosen = [1, 4, 5]
    partial = engine.row_similarity(0, 2, statenos=chosen)
    assert len(partial) == len(chosen)
def run_test(args):
    """Learn each shape dataset and plot original vs simulated points.

    `args` is a dict with keys "num_rows", "num_iters", "num_chains".
    Returns the matplotlib figure with a 2x4 grid: originals on top,
    simulations below.
    """
    n_rows = args["num_rows"]
    n_iters = args["num_iters"]
    n_chains = args["num_chains"]
    # Split the simulated-sample budget evenly across chains.
    n_per_chain = int(float(n_rows) / n_chains)
    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 9))
    axes = axes.ravel()
    k = 0
    for shape in shapes:
        print "Shape: %s" % shape
        T_o = np.asarray(gen_function[shape](n_rows))
        T_i = []
        engine = Engine(T_o.T,
            cctypes=cctypes,
            distargs=distargs,
            num_states=n_chains)
        engine.transition(N=n_iters)
        for chain in xrange(n_chains):
            state = engine.get_state(chain)
            print "chain %i of %i" % (chain + 1, n_chains)
            # rowid=-1 simulates a hypothetical (unincorporated) row.
            T_i.extend(state.simulate(-1, [0, 1], N=n_per_chain))
        T_i = np.array(T_i)
        ax = axes[k]
        ax.scatter(T_o[0], T_o[1], color='blue', edgecolor='none')
        ax.set_xlabel("X")
        ax.set_ylabel("Y")
        ax.set_title("%s original" % shape)
        ax = axes[k + 4]
        ax.scatter(T_i[:, 0], T_i[:, 1], color='red', edgecolor='none')
        ax.set_xlabel("X")
        ax.set_ylabel("Y")
        # NOTE(review): setting limits to their own current values is a
        # no-op; possibly intended to copy the limits of the "original"
        # axes above -- confirm intent.
        ax.set_xlim(ax.get_xlim())
        ax.set_ylim(ax.get_ylim())
        ax.set_title("%s simulated" % shape)
        k += 1
    print "Done."
    return fig
def get_engine():
    """Build a 6-state engine over 20 rows of 6 mixed-type columns, with a
    handful of cells blanked out so queries can supply them as evidence."""
    column_types, column_distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])
    T, Zv, Zc = tu.gen_data_table(
        20, [1], [[.25, .25, .5]], column_types, column_distargs,
        [.95] * len(column_types), rng=gu.gen_rng(0))
    T = T.T
    # Make some nan cells for evidence.
    for col in (0, 1, 2, 3):
        T[5, col] = np.nan
    T[8, 4] = np.nan
    engine = Engine(
        T,
        cctypes=column_types,
        distargs=column_distargs,
        num_states=6,
        rng=gu.gen_rng(0),
    )
    engine.transition(N=2)
    return engine
def render_states_to_disk(filepath, prefix):
    """Render every state of the pickled engine at `filepath` to disk,
    one image per state, named '<prefix>-<stateno>'."""
    engine = Engine.from_pickle(filepath)
    for i in range(engine.num_states()):
        # Progress indicator; carriage return keeps it on one line.
        print '\r%d' % (i, )
        savefile = '%s-%d' % (prefix, i)
        state = engine.get_state(i)
        ru.viz_state(state,
            row_names=animal_names,
            col_names=animal_features,
            savefile=savefile)
def generate_gpmcc_posteriors(cctype, distargs, D_train, iters, seconds): """Learns gpmcc on D_train for seconds and simulates NUM_TEST times.""" # Learning and posterior simulation. engine = Engine(D_train, cctypes=[cctype], distargs=[distargs], num_states=64, rng=gu.gen_rng(1)) engine.transition(N=iters, S=seconds, progress=0) if iters: kernel = 'column_params' if cu.cctype_class(cctype).is_conditional()\ else 'column_hypers' engine.transition(N=100, kernels=[kernel], progress=0) samples = engine.simulate(-1, [0], N=NUM_TEST) marginals = engine.logpdf_score() ranking = np.argsort(marginals)[::-1] for r in ranking[:5]: engine.get_state(r).plot() return [samples[i] for i in ranking[:5]]
def test_engine_composition():
    """Smoke test composing a VsCGpm component with a crosscat engine."""
    from cgpm.crosscat.engine import Engine
    X = np.asarray([
        [1, 2, 0, 1],
        [1, 1, 0, 0],
    ])
    # The engine models only column 3; the VsCGpm models 0 and 1 given 3.
    engine = Engine(X[:, [3]], outputs=[3], cctypes=['normal'], num_states=2)
    cgpm = VsCGpm(
        outputs=[0, 1],
        inputs=[3],
        source=source_abstract,
    )
    for i, row in enumerate(X):
        cgpm.incorporate(i, {0: row[0], 1: row[1]}, {3: row[3]})
    cgpm.transition(N=2)
    engine.compose_cgpm([cgpm, cgpm], multiprocess=True)
def test_relevance_probability__ci_(engine):
    """Relevance probability returns one value per requested state."""
    engine = Engine.from_metadata(engine)
    # Default: all states are queried.
    everything = engine.relevance_probability(0, [2, 14], 0, statenos=None)
    assert len(everything) == engine.num_states()
    # Naming every state explicitly yields the same count.
    explicit = engine.relevance_probability(
        0, [2, 14], 0, statenos=range(engine.num_states()))
    assert len(explicit) == engine.num_states()
def test_two_views_row_partition_bernoulli__ci_(lovecat):
    """Run row-partition inference on a 2-view bernoulli dataset with the
    view partition Zv held fixed, and plot row similarities per view."""
    D = retrieve_bernoulli_dataset()
    if lovecat:
        # Lovecat uses categorical(k=2) in place of bernoulli.
        engine = Engine(D.T,
            cctypes=['categorical'] * len(D),
            distargs=[{'k': 2}] * len(D),
            Zv={0: 0, 1: 0, 2: 1, 3: 1},
            rng=gu.gen_rng(12),
            num_states=64)
        # Only row-level and hyperparameter kernels: Zv stays fixed.
        engine.transition_lovecat(N=100, kernels=[
            'row_partition_assignments',
            'row_partition_hyperparameters',
            'column_hyperparameters',
        ])
    else:
        engine = Engine(D.T,
            cctypes=['bernoulli'] * len(D),
            Zv={0: 0, 1: 0, 2: 1, 3: 1},
            rng=gu.gen_rng(12),
            num_states=64)
        # Equivalent cgpm kernels, again leaving Zv fixed.
        engine.transition(N=100, kernels=[
            'view_alphas',
            'rows',
            'column_hypers',
        ])
    # Row similarities within each fixed view, for visual inspection.
    R1 = engine.row_similarity_pairwise(cols=[0, 1])
    R2 = engine.row_similarity_pairwise(cols=[2, 3])
    pu.plot_clustermap(R1)
    pu.plot_clustermap(R2)
    return engine
def get_engine():
    """Construct a small 4-state engine over three rows of mixed-type
    toy data (one normal, one bernoulli, one categorical column)."""
    data = [
        [0.123, 1, 0],
        [1.12, 0, 1],
        [1.1, 1, 2],
    ]
    prng = gu.gen_rng(1)
    # Outputs are deliberately non-contiguous to exercise output mapping.
    return Engine(
        data,
        outputs=[8, 7, 9],
        num_states=4,
        cctypes=['normal', 'bernoulli', 'categorical'],
        distargs=[None, None, {'k': 3}],
        rng=prng,
    )
def test_dependence_probability_pairwise():
    """With Zv held fixed, pairwise dependence equals the view indicator."""
    cctypes, distargs = cu.parse_distargs(['normal', 'normal', 'normal'])
    T, Zv, _Zc = tu.gen_data_table(10, [.5, .5], [[.25, .25, .5], [.3, .7]],
        cctypes, distargs, [.95] * len(cctypes), rng=gu.gen_rng(100))
    outputs = [0, 1, 2]
    # Fix the view partition so dependence probabilities are deterministic.
    engine = Engine(T.T,
        outputs=outputs,
        cctypes=cctypes,
        num_states=4,
        distargs=distargs,
        Zv={o: z for o, z in zip(outputs, Zv)},
        rng=gu.gen_rng(0))
    Ds = engine.dependence_probability_pairwise(multiprocess=0)
    assert len(Ds) == engine.num_states()
    assert all(np.shape(D) == (len(outputs), len(outputs)) for D in Ds)
    for D in Ds:
        for col0, col1 in itertools.product(outputs, outputs):
            i0 = outputs.index(col0)
            i1 = outputs.index(col1)
            actual = D[i0, i1]
            # Columns in the same (fixed) view are dependent w.p. 1.
            expected = Zv[i0] == Zv[i1]
            assert actual == expected
    # Restricting to a subset of columns shrinks the matrices accordingly.
    Ds = engine.dependence_probability_pairwise(colnos=[0, 2], multiprocess=0)
    assert len(Ds) == engine.num_states()
    assert all(np.shape(D) == (2, 2) for D in Ds)
def launch_analysis():
    """Train a 64-state engine on the animals data, pickle it to disk,
    reload it, and plot the pairwise dependence-probability clustermap.

    Fixes over the original: the pickle file is opened in binary mode
    ('wb'/'rb'), as pickle requires for portability, and both handles are
    managed by `with` blocks (the read handle was previously leaked).
    """
    engine = Engine(animals.values.astype(float),
        num_states=64,
        cctypes=['categorical'] * len(animals.values[0]),
        distargs=[{'k': 2}] * len(animals.values[0]),
        rng=gu.gen_rng(7))
    engine.transition(N=900)
    with open('resources/animals/animals.engine', 'wb') as f:
        engine.to_pickle(f)
    # Round-trip through the pickle to exercise serialization.
    with open('resources/animals/animals.engine', 'rb') as f:
        engine = Engine.from_pickle(f)
    D = engine.dependence_probability_pairwise()
    pu.plot_clustermap(D)
def test_incorporate_engine():
    """Incorporate a new dim with a non-contiguous output, then serialize
    and run a targeted transition on it."""
    engine = Engine(
        T[:,:2],
        cctypes=CCTYPES[:2],
        distargs=DISTARGS[:2],
        num_states=4,
        rng=gu.gen_rng(0),
    )
    engine.transition(N=5)
    # Incorporate a new dim into with a non-contiguous output.
    engine.incorporate_dim(
        T[:,2],
        outputs=[10],
        cctype=CCTYPES[2],
        distargs=DISTARGS[2]
    )
    engine.transition(N=2)
    # Serialize the engine, and run a targeted transition on variable 10.
    m = engine.to_metadata()
    engine2 = Engine.from_metadata(m)
    engine2.transition(N=2, cols=[10], multiprocess=0)
    # NOTE(review): this asserts on `engine`, not the round-tripped
    # `engine2`; possibly `engine2.states` was intended -- confirm.
    assert all(s.outputs == [0,1,10] for s in engine.states)
def from_metadata(metadata, seed):
    """Reconstruct a TRCRP_Mixture from a metadata dict, seeding its rng
    with `seed`, and return the populated model."""
    model = TRCRP_Mixture(
        chains=metadata['chains'],
        lag=metadata['lag'],
        variables=metadata['variables'],
        rng=np.random.RandomState(seed),
    )
    # Internal fields.
    model.initialized = metadata['initialized']
    model.dataset = pd.DataFrame(
        metadata['dataset.values'],
        index=metadata['dataset.index'],
        columns=metadata['dataset.columns'],
    )
    # The engine only exists once the model has been initialized.
    if model.initialized:
        model.engine = Engine.from_metadata(metadata['engine'])
    else:
        model.engine = None
    return model
def test_dependence_probability():
    '''Test that Loom correctly recovers a 2-view dataset.'''
    # Ground truth: columns 0-2 in one view, 3-5 in the other.
    D, Zv, Zc = tu.gen_data_table(n_rows=150,
        view_weights=None,
        cluster_weights=[
            [.2, .2, .2, .4],
            [.3, .2, .5],
        ],
        cctypes=['normal'] * 6,
        distargs=[None] * 6,
        separation=[0.95] * 6,
        view_partition=[0, 0, 0, 1, 1, 1],
        rng=gu.gen_rng(12))
    # Non-contiguous outputs exercise the output-index mapping.
    engine = Engine(
        D.T,
        outputs=[7, 2, 12, 80, 129, 98],
        cctypes=['normal'] * len(D),
        distargs=[None] * 6,
        rng=gu.gen_rng(122),
        num_states=20,
    )
    logscore0 = engine.logpdf_score()
    engine.transition_loom(N=100)
    logscore1 = engine.logpdf_score()
    # Inference should improve the mean marginal score.
    assert numpy.mean(logscore1) > numpy.mean(logscore0)
    # Average pairwise dependence probabilities over all states.
    dependence_probability = numpy.mean(
        engine.dependence_probability_pairwise(), axis=0)
    # Within-view pairs: high dependence.
    assert dependence_probability[0, 1] > 0.8
    assert dependence_probability[1, 2] > 0.8
    assert dependence_probability[0, 2] > 0.8
    assert dependence_probability[3, 4] > 0.8
    assert dependence_probability[4, 5] > 0.8
    assert dependence_probability[3, 5] > 0.8
    # Across-view pairs: low dependence.
    assert dependence_probability[0, 3] < 0.2
    assert dependence_probability[0, 4] < 0.2
    assert dependence_probability[0, 5] < 0.2
    assert dependence_probability[1, 3] < 0.2
    assert dependence_probability[1, 4] < 0.2
    assert dependence_probability[1, 5] < 0.2
    assert dependence_probability[2, 3] < 0.2
    assert dependence_probability[2, 4] < 0.2
    assert dependence_probability[2, 5] < 0.2
def test_entropy_bernoulli_bivariate__ci_():
    """Entropy of a bivariate Bernoulli: exact value vs logpdf-based and
    mutual-information-based estimates must agree within tolerance."""
    rng = gen_rng(10)
    # Generate a bivariate Bernoulli dataset with P(X)=PX, P(Y|X=x)=PY[x].
    PX = [.3, .7]
    PY = [[.2, .8], [.6, .4]]
    TX = rng.choice([0, 1], p=PX, size=250)
    TY = np.zeros(shape=len(TX))
    TY[TX == 0] = rng.choice([0, 1], p=PY[0], size=len(TX[TX == 0]))
    # BUG FIX: rows with X==1 must be sampled from PY[1]; the original used
    # PY[0] here, contradicting the exact entropy computed below (which
    # uses PY[1] for the X=1 branch).
    TY[TX == 1] = rng.choice([0, 1], p=PY[1], size=len(TX[TX == 1]))
    T = np.column_stack((TY, TX))
    engine = Engine(
        T,
        cctypes=['categorical', 'categorical'],
        distargs=[{'k': 2}, {'k': 2}],
        num_states=64,
        rng=rng,
    )
    engine.transition_lovecat(N=200)
    # exact computation
    entropy_exact = (
        -PX[0] * PY[0][0] * np.log(PX[0] * PY[0][0])
        - PX[0] * PY[0][1] * np.log(PX[0] * PY[0][1])
        - PX[1] * PY[1][0] * np.log(PX[1] * PY[1][0])
        - PX[1] * PY[1][1] * np.log(PX[1] * PY[1][1]))
    # logpdf computation over all four joint outcomes
    logps = engine.logpdf_bulk(
        [-1, -1, -1, -1],
        [{0: 0, 1: 0}, {0: 0, 1: 1}, {0: 1, 1: 0}, {0: 1, 1: 1}])
    entropy_logpdf = [-np.sum(np.exp(logp) * logp) for logp in logps]
    # mutual_information computation.
    entropy_mi = engine.mutual_information([0, 1], [0, 1], N=1000)
    # Punt CLT analysis and go for a small tolerance.
    assert np.allclose(entropy_exact, entropy_logpdf, atol=.15)
    assert np.allclose(entropy_exact, entropy_mi, atol=.15)
    assert np.allclose(entropy_logpdf, entropy_mi, atol=.1)
def test_multiple_stattypes():
    '''Test cgpm statistical types are heuristically converted to Loom types.'''
    cctypes, distargs = cu.parse_distargs([
        'normal', 'poisson', 'bernoulli', 'categorical(k=4)', 'lognormal',
        'exponential', 'beta', 'geometric', 'vonmises'
    ])
    T, Zv, Zc = tu.gen_data_table(200, [1], [[.25, .25, .5]], cctypes,
        distargs, [.95] * len(cctypes), rng=gu.gen_rng(10))
    engine = Engine(
        T.T,
        cctypes=cctypes,
        distargs=distargs,
        rng=gu.gen_rng(15),
        num_states=16,
    )
    logscore0 = engine.logpdf_score()
    engine.transition_loom(N=5)
    logscore1 = engine.logpdf_score()
    # Even a few loom transitions should raise the mean marginal score.
    assert numpy.mean(logscore1) > numpy.mean(logscore0)
    # Check serialization.
    metadata = engine.to_metadata()
    modname = importlib.import_module(metadata['factory'][0])
    builder = getattr(modname, metadata['factory'][1])
    engine2 = builder.from_metadata(metadata)
    # To JSON.
    json_metadata = json.dumps(engine.to_metadata())
    engine3 = builder.from_metadata(json.loads(json_metadata))
    # Assert all states in engine, engine2, and engine3 have same loom_path.
    loom_paths = list(
        itertools.chain.from_iterable([s._loom_path for s in e.states]
            for e in [engine, engine2, engine3]))
    assert all(p == loom_paths[0] for p in loom_paths)
    engine2.transition(S=5)
    dependence_probability = engine2.dependence_probability_pairwise()
    assert numpy.all(dependence_probability > 0.85)
def test_errors():
    """Targets loomcat._validate_transition."""
    D, Zv, Zc = tu.gen_data_table(n_rows=150,
        view_weights=None,
        cluster_weights=[
            [.2, .2, .2, .4],
            [.3, .2, .5],
        ],
        cctypes=['normal'] * 6,
        distargs=[None] * 6,
        separation=[0.95] * 6,
        view_partition=[0, 0, 0, 1, 1, 1],
        rng=gu.gen_rng(12))
    state = State(
        D.T,
        outputs=range(10, 16),
        cctypes=['normal'] * len(D),
        distargs=[None] * 6,
        rng=gu.gen_rng(122),
    )
    engine = Engine(
        D.T,
        outputs=range(10, 16),
        cctypes=['normal'] * len(D),
        distargs=[None] * 6,
        rng=gu.gen_rng(122),
    )
    def check_errors(cgpm):
        # Each unsupported argument to transition_loom must raise.
        with pytest.raises(ValueError):
            cgpm.transition_loom(N=10, S=5)
        with pytest.raises(ValueError):
            cgpm.transition_loom(N=10, kernels=['alpha'])
        with pytest.raises(ValueError):
            cgpm.transition_loom(N=10, progress=True)
        # NOTE(review): exact duplicate of the previous check -- possibly
        # a copy-paste slip; another invalid argument may have been meant.
        with pytest.raises(ValueError):
            cgpm.transition_loom(N=10, progress=True)
        with pytest.raises(ValueError):
            cgpm.transition_loom(N=10, checkpoint=2)
        # A plain call with only N must succeed.
        cgpm.transition_loom(N=2)
    check_errors(state)
    check_errors(engine)
def gen_simple_engine(multiprocess=1):
    """Build a fully specified 20-state engine over a single row of three
    bernoulli cells, with fixed view (Zv) and row (Zrv) partitions."""
    data = np.array([[1, 1, 1]])
    R = len(data)
    D = len(data[0])
    outputs = range(D)
    engine = Engine(
        X=data,
        num_states=20,
        rng=gu.gen_rng(1),
        multiprocess=multiprocess,
        outputs=outputs,
        alpha=1.,
        cctypes=['bernoulli']*D,
        # NOTE(review): distargs is a dict keyed by output here, whereas
        # other call sites pass a list -- confirm Engine accepts both.
        distargs={i: {'alpha': 1., 'beta': 1.} for i in outputs},
        # Columns 0,1 share a view; column 2 has its own.
        Zv={0: 0, 1: 0, 2: 1},
        view_alphas=[1.]*D,
        # One row, assigned to cluster 0 in each view.
        Zrv={0: [0]*R, 1: [0]*R})
    return engine