def engine():
    """Return a four-state, single-process Engine over a synthetic table.

    The table is generated by ``tu.gen_data_table`` for nine columns of mixed
    statistical types, with fixed seeds for both the data and the Engine.
    """
    stattypes = [
        'normal',
        'poisson',
        'bernoulli',
        'categorical(k=4)',
        'lognormal',
        'exponential',
        'beta',
        'geometric',
        'vonmises',
    ]
    cctypes, distargs = cu.parse_distargs(stattypes)

    # Only the data table is needed; the other two generated values are
    # deliberately discarded.
    table, _Zv, _Zc = tu.gen_data_table(
        20, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(10))

    return Engine(
        table.T,
        cctypes=cctypes,
        distargs=distargs,
        num_states=4,
        rng=gu.gen_rng(312),
        multiprocess=False,
    )
def test_dependence_probability_pairwise():
    """Pairwise dependence matrices must match the column partition Zv.

    The Engine is initialised with the column partition returned by the data
    generator, and no transitions are run, so every per-state matrix entry
    (i, j) is expected to equal ``Zv[i] == Zv[j]``.
    """
    cctypes, distargs = cu.parse_distargs(['normal', 'normal', 'normal'])
    T, Zv, _Zc = tu.gen_data_table(
        10, [.5, .5], [[.25, .25, .5], [.3, .7]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(100))

    outputs = [0, 1, 2]
    engine = Engine(
        T.T,
        outputs=outputs,
        cctypes=cctypes,
        num_states=4,
        distargs=distargs,
        Zv=dict(zip(outputs, Zv)),
        rng=gu.gen_rng(0))

    # Full matrix: one (n_cols x n_cols) array per state.
    n_cols = len(outputs)
    Ds = engine.dependence_probability_pairwise(multiprocess=0)
    assert len(Ds) == engine.num_states()
    assert all(np.shape(D) == (n_cols, n_cols) for D in Ds)

    # `outputs` has unique entries, so iterating over positions is the same
    # as looking each output up with outputs.index().
    for D in Ds:
        for i0, i1 in itertools.product(range(n_cols), repeat=2):
            same_view = Zv[i0] == Zv[i1]
            assert D[i0, i1] == same_view

    # Restricting to a subset of columns shrinks each matrix accordingly.
    Ds = engine.dependence_probability_pairwise(colnos=[0, 2], multiprocess=0)
    assert len(Ds) == engine.num_states()
    assert all(np.shape(D) == (2, 2) for D in Ds)
def state():
    """Return a State over synthetic data with every column in view 0.

    The data table is generated by ``tu.gen_data_table`` for six columns of
    mixed statistical types; both the data and the State use fixed seeds.
    """
    # Set up the data generation.
    cctypes, distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])
    T, _Zv, _Zc = tu.gen_data_table(
        30, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(0))

    # `range` (not the Python-2-only `xrange`) keeps this runnable on both
    # Python 2 and Python 3; the resulting dict is identical.
    single_view = {i: 0 for i in range(len(cctypes))}
    return State(
        T.T,
        cctypes=cctypes,
        distargs=distargs,
        Zv=single_view,
        rng=gu.gen_rng(0))
def test_multiple_stattypes():
    '''Test cgpm statistical types are heuristically converted to Loom types.

    Also round-trips the Engine through its metadata (directly and via JSON)
    and checks that every state keeps the same loom path.
    '''
    stattypes = [
        'normal', 'poisson', 'bernoulli', 'categorical(k=4)', 'lognormal',
        'exponential', 'beta', 'geometric', 'vonmises',
    ]
    cctypes, distargs = cu.parse_distargs(stattypes)
    T, _Zv, _Zc = tu.gen_data_table(
        200, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(10))
    engine = Engine(
        T.T,
        cctypes=cctypes,
        distargs=distargs,
        rng=gu.gen_rng(15),
        num_states=16,
    )

    # Loom transitions are expected to raise the mean log score.
    score_before = engine.logpdf_score()
    engine.transition_loom(N=5)
    score_after = engine.logpdf_score()
    assert numpy.mean(score_after) > numpy.mean(score_before)

    # Check serialization: the metadata names the module and the class that
    # can rebuild the engine.
    metadata = engine.to_metadata()
    factory_module = importlib.import_module(metadata['factory'][0])
    builder = getattr(factory_module, metadata['factory'][1])
    engine2 = builder.from_metadata(metadata)

    # To JSON and back.
    json_metadata = json.dumps(engine.to_metadata())
    engine3 = builder.from_metadata(json.loads(json_metadata))

    # Assert all states in engine, engine2, and engine3 have same loom_path.
    loom_paths = [
        s._loom_path
        for e in (engine, engine2, engine3)
        for s in e.states
    ]
    assert all(p == loom_paths[0] for p in loom_paths)

    engine2.transition(S=5)
    dependence_probability = engine2.dependence_probability_pairwise()
    # NOTE(review): this comparison is only elementwise if the call above
    # returns a numpy array; if it returns a list of arrays it would need
    # numpy.asarray(...) first -- confirm against the Engine API.
    assert numpy.all(dependence_probability > 0.85)
def get_engine():
    """Return a six-state Engine over synthetic data with a few missing cells.

    Cells (5, 0..3) and (8, 4) are set to NaN before the Engine is built, and
    the Engine is advanced with ``transition(N=2)`` before being returned.
    """
    cctypes, distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])
    data, _Zv, _Zc = tu.gen_data_table(
        20, [1], [[.25, .25, .5]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(0))
    data = data.T

    # Make some nan cells for evidence: the first four columns of row 5 and
    # one cell of row 8.
    data[5, :4] = np.nan
    data[8, 4] = np.nan

    engine = Engine(
        data,
        cctypes=cctypes,
        distargs=distargs,
        num_states=6,
        rng=gu.gen_rng(0))
    engine.transition(N=2)
    return engine
def state():
    """Return a State whose column 0 is modelled by a random forest.

    All four columns start in view 0. Column 0 is switched from
    'categorical(k=5)' to 'random_forest' (k=5), and a single transition is
    run over a fixed list of kernels before the State is returned.
    """
    cctypes, distargs = cu.parse_distargs(
        ['categorical(k=5)', 'normal', 'poisson', 'bernoulli'])
    T, _Zv, _Zc = tu.gen_data_table(
        50, [1], [[.33, .33, .34]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(0))

    # `range` (not the Python-2-only `xrange`) keeps this runnable on both
    # Python 2 and Python 3; the resulting dict is identical.
    s = State(
        T.T,
        cctypes=cctypes,
        distargs=distargs,
        Zv={i: 0 for i in range(len(cctypes))},
        rng=gu.gen_rng(0))
    s.update_cctype(0, 'random_forest', distargs={'k': 5})

    # XXX Known issue flagged by the original author: enabling the call below
    # triggers a bug (not diagnosed here).
    # s.update_cctype(1, 'linear_regression')

    kernels = [
        'rows',
        'view_alphas',
        'alpha',
        'column_params',
        'column_hypers',
    ]
    s.transition(N=1, kernels=kernels)
    return s
import matplotlib.pyplot as plt import numpy as np import seaborn as sns from cgpm.crosscat.engine import Engine from cgpm.utils import config as cu np.random.seed(0) N_ROWS = 300 N_STATES = 12 N_ITERS = 100 cctypes = ['categorical(k={})'.format(N_ROWS)] + ['normal']*8 cctypes, distargs = cu.parse_distargs(cctypes) column_names = ['id'] + ['one cluster']*4 + ['four cluster']*4 # id column. X = np.zeros((N_ROWS, 9)) X[:,0] = np.arange(N_ROWS) # Four columns of one cluster from the standard normal. X[:,1:5] = np.random.randn(N_ROWS, 4) # Four columns of four clusters with unit variance and means \in {0,1,2,3}. Z = np.random.randint(4, size=(N_ROWS)) X[:,5:] = 4*np.reshape(np.repeat(Z,4), (len(Z),4)) + np.random.randn(N_ROWS, 4) # Inference. engine = Engine(
import numpy as np from cgpm.crosscat import lovecat from cgpm.crosscat.engine import Engine from cgpm.crosscat.state import State from cgpm.utils import config as cu from cgpm.utils import general as gu from cgpm.utils import test as tu # -- Global variables shared by all module functions. rng = gu.gen_rng(2) outputs = range(8) cctypes, distargs = cu.parse_distargs([ 'normal', 'poisson', 'bernoulli', 'categorical(k=8)', 'lognormal', 'categorical(k=4)', 'beta', 'vonmises' ]) def generate_dataset(): # Set up the data generation, 20 rows by 8 cols, with some missing values. D, Zv, Zc = tu.gen_data_table(20, [1], [[.25, .25, .5]], cctypes, distargs, [.95] * len(cctypes), rng=gu.gen_rng(2)) # Generate some missing entries in D. missing = rng.choice(range(D.shape[1]), size=(D.shape[0], 4), replace=True) for i, m in enumerate(missing): D[i, m] = np.nan
# See the License for the specific language governing permissions and # limitations under the License. import pytest from cgpm.crosscat.engine import Engine from cgpm.crosscat.state import State from cgpm.utils import config as cu from cgpm.utils import general as gu from cgpm.utils import test as tu CCTYPES, DISTARGS = cu.parse_distargs([ 'normal', # 0 'poisson', # 1 'bernoulli', # 2 'lognormal', # 3 'exponential', # 4 'geometric', # 5 'vonmises']) # 6 T, Zv, Zc = tu.gen_data_table( 10, [1], [[.33, .33, .34]], CCTYPES, DISTARGS, [.95]*len(CCTYPES), rng=gu.gen_rng(0)) T = T.T def test_incorporate_engine(): engine = Engine( T[:,:2], cctypes=CCTYPES[:2],
# limitations under the License. import importlib import pytest from math import log import numpy as np from cgpm.regressions.ols import OrdinaryLeastSquares from cgpm.utils import config as cu from cgpm.utils import general as gu from cgpm.utils import test as tu cctypes, distargs = cu.parse_distargs([ 'normal', 'categorical(k=3)', 'poisson', 'bernoulli', 'lognormal', 'exponential', 'geometric', 'vonmises', 'normal' ]) T, Zv, Zc = tu.gen_data_table(100, [1], [[.33, .33, .34]], cctypes, distargs, [.2] * len(cctypes), rng=gu.gen_rng(0)) D = T.T OLS_DISTARGS = { 'inputs': { 'stattypes': cctypes[1:], 'statargs': [{ 'k': 3 }] + [None] + [{
# See the License for the specific language governing permissions and # limitations under the License. import pytest from cgpm.crosscat.state import State from cgpm.utils import config as cu from cgpm.utils import general as gu from cgpm.utils import test as tu CCTYPES, DISTARGS = cu.parse_distargs([ 'normal', 'poisson', 'categorical(k=2)', 'bernoulli', 'lognormal', 'exponential', 'geometric', 'vonmises']) T, Zv, Zc = tu.gen_data_table( 20, [1], [[.33, .33, .34]], CCTYPES, DISTARGS, [.95]*len(CCTYPES), rng=gu.gen_rng(0)) T = T.T def test_categorical_bernoulli(): state = State( T, cctypes=CCTYPES, distargs=DISTARGS, rng=gu.gen_rng(0))
def parse_schema(schema, dataframe):
    """Apply a schema to a dataframe, and return variables to construct State.

    Parameters
    ----------
    schema : list(tuple)
        A list of 3-tuples ('column', 'stattype', index). 'stattype' is either
        a statistical type string or 'ignore' (the column is skipped). `index`
        is recorded verbatim as that column's entry in the returned `outputs`.
        For categorical columns the number of components may be given as a
        distarg, e.g. 'categorical(k=7)'; it must be at least the number of
        distinct values found in the column. If unspecified, the number of
        components is estimated from the dataset.
    dataframe : pd.DataFrame
        Dataframe containing the dataset to parse according to the schema.
        All missing values must be 'NA' or np.nan -- otherwise very bad things
        will happen. NOTE: 'NA' entries are replaced by np.nan IN PLACE, so
        the caller's dataframe is modified.

    Returns
    -------
    T : np.array
        Data matrix that gpmcc can ingest, one column per retained schema
        entry.
    outputs : list
        The `index` of each retained schema entry, in column order.
    cctypes : list<str>
        List of cctype strings that gpmcc can ingest (distargs stripped).
    distargs : list<dict>
        Distargs for the cctypes, according to the schema.
    valmap : dict<str->dict>
        For bernoulli or categorical columns, the mapping produced by
        `build_valmap` that was used to recode the column's values;
        valmap['column'] gives the mapping for such a column. Needed for
        reference only, not for gpmcc.
    columns : list<str>
        List of column names, where columns[i] is the ith column of T. Needed
        for reference only, not for gpmcc.

    Raises
    ------
    AssertionError
        If a bernoulli column does not have exactly two distinct values, or a
        categorical column has more distinct values than the user-supplied k.

    Example
    -------
    >>> dataframe = pd.read_csv('dataset.csv')
    >>> schema = [('id','ignore',0), ('age','normal',1),
    ...     ('gender','bernoulli',2), ('university','categorical(k=2)',3),
    ...     ('country','categorical',4)]
    >>> T, outputs, cctypes, distargs, valmap, columns = parse_schema(
    ...     schema, dataframe)

    - T will be the dataset as np array.
    - outputs = [1, 2, 3, 4]
    - cctypes = ['normal', 'bernoulli', 'categorical', 'categorical']
    - distargs = [None, None, {'k':2}, {'k':3}]
    - valmap = {
        'university': {
            'mit': 0,
            'harvard': 1
            },
        'country': {
            'usa': 0,
            'nepal': 1,
            'lebanon': 2
            }
        }
        where 'k' for 'country' has been extracted from the dataset.

    >>> S = cgpm.crosscat.state.State(
    ...     T, outputs=outputs, cctypes=cctypes, distargs=distargs)
    """
    dataframe.replace('NA', np.nan, inplace=True)
    D = []
    cctypes, distargs = [], []
    valmap = dict()
    columns = []
    outputs = []
    for column, stattype, index in schema:
        if stattype == 'ignore':
            continue
        # Split the stattype (not the column name) into its base type and
        # optional distargs, e.g. 'categorical(k=7)' -> 'categorical', {'k': 7}.
        parsed_types, parsed_args = cu.parse_distargs([stattype])
        cctype, distarg = parsed_types[0], parsed_args[0]
        X = dataframe[column]
        columns.append(column)
        cctypes.append(cctype)
        outputs.append(index)
        distargs.append(distarg)
        # XXX Should check for is_numeric!
        if cctype in ['bernoulli', 'categorical']:
            mapping = build_valmap(X)
            X = X.replace(mapping)
            valmap[column] = mapping
            if cctype == 'bernoulli':
                assert len(mapping) == 2
            else:
                # Did user specify the number of categories?
                k_user = distarg.get('k') if distarg else None
                if k_user is None:
                    distargs[-1] = {'k': len(mapping)}
                else:
                    assert len(mapping) <= k_user
        D.append(X)
    T = np.asarray(D).T
    assert len(cctypes) == len(distargs) == len(columns)
    assert len(columns) == T.shape[1]
    return T, outputs, cctypes, distargs, valmap, columns
def test_serialize_composite_cgpm():
    """Serialize a State with hooked foreign cgpms and smoke-test an Engine.

    A RandomForest (output 0) and a LinearRegression (output 1) are composed
    with a State over outputs 2-5. The State is round-tripped through its
    metadata, and the hooked cgpms of the copy must be distinct objects with
    matching log scores. The same cgpms are then composed into an Engine and
    a handful of queries are exercised.

    NOTE: a single `rng` is shared by every component, so the order of the
    statements below determines the random stream each one sees.
    """
    rng = gu.gen_rng(2)

    # Generate the data.
    cctypes, distargs = cu.parse_distargs([
        'categorical(k=3)',     # RandomForest          0
        'normal',               # LinearRegression      1
        'categorical(k=3)',     # GPMCC                 2
        'poisson',              # GPMCC                 3
        'normal',               # GPMCC                 4
        'lognormal'             # GPMCC                 5
        ])
    T, Zv, Zc = tu.gen_data_table(
        35, [.4, .6], [[.33, .33, .34], [.5, .5]],
        cctypes, distargs, [.2]*len(cctypes), rng=rng)
    D = np.transpose(T)

    # Create GPMCC.
    state = State(
        D[:,2:], outputs=[2,3,4,5], cctypes=cctypes[2:],
        distargs=distargs[2:], rng=rng)

    # Create a Forest.
    forest = RandomForest(
        outputs=[0],
        inputs=[1,2,3,4],
        distargs={
            'inputs': {
                'stattypes': [cctypes[i] for i in [1,2,3,4]],
                'statargs': [distargs[i] for i in [1,2,3,4]]},
            'k': distargs[0]['k']},
        rng=rng)

    # Create a Regression.
    linreg = LinearRegression(
        outputs=[1],
        inputs=[3,4,5],
        distargs={
            'inputs': {
                'stattypes': [cctypes[i] for i in [3,4,5]],
                'statargs': [distargs[i] for i in [3,4,5]]}},
        rng=rng)

    # Incorporate the data.
    def incorporate_data(cgpm, rowid, row):
        cgpm.incorporate(
            rowid,
            {i: row[i] for i in cgpm.outputs},
            {i: row[i] for i in cgpm.inputs},
        )
    for rowid, row in enumerate(D):
        incorporate_data(forest, rowid, row)
        incorporate_data(linreg, rowid, row)

    # Run state transitions.
    state.transition(N=10, progress=False)
    # Compose CGPMs, instructing State to run the transitions.
    token_forest = state.compose_cgpm(forest)
    token_linreg = state.compose_cgpm(linreg)
    state.transition_foreign(N=10, cols=[forest.outputs[0], linreg.outputs[0]])

    # Now run the serialization.
    metadata = state.to_metadata()
    state2 = State.from_metadata(metadata)

    # Check that the tokens are in state2.
    assert token_forest in state2.hooked_cgpms
    assert token_linreg in state2.hooked_cgpms

    # The hooked cgpms must be unique objects after serialize/deserialize.
    # Identity (not equality) is what is being checked, so use `is not`.
    assert (state.hooked_cgpms[token_forest]
        is not state2.hooked_cgpms[token_forest])
    assert (state.hooked_cgpms[token_linreg]
        is not state2.hooked_cgpms[token_linreg])

    # Check that the log scores of the hooked cgpms agree.
    assert np.allclose(
        state.hooked_cgpms[token_forest].logpdf_score(),
        state2.hooked_cgpms[token_forest].logpdf_score())
    assert np.allclose(
        state.hooked_cgpms[token_linreg].logpdf_score(),
        state2.hooked_cgpms[token_linreg].logpdf_score())

    # Now run some tests for the engine.
    e = Engine(
        D[:,2:], outputs=[2,3,4,5], cctypes=cctypes[2:],
        distargs=distargs[2:], num_states=2, rng=rng)
    e.compose_cgpm([forest, forest], multiprocess=1)
    e.compose_cgpm([linreg, linreg], multiprocess=1)
    e.transition_foreign(N=1, cols=[forest.outputs[0], linreg.outputs[0]])
    e.dependence_probability(0,1)
    e.simulate(-1, [0,1], {2:1}, multiprocess=0)
    e.logpdf(-1, {1:1}, {2:1, 0:0}, multiprocess=0)

    state3 = e.get_state(0)

    # There is no guarantee that the logpdf score improves with inference, but
    # it should not drop by more than a few nats. The previous predicate
    # (`before < after or (after-before) < 5`) was always true and its result
    # was never asserted, so these checks used to be no-ops.
    def check_logpdf_delta(before, after):
        return (before - after) < 5
    assert check_logpdf_delta(
        before=state.hooked_cgpms[token_forest].logpdf_score(),
        after=state3.hooked_cgpms[token_forest].logpdf_score())
    assert check_logpdf_delta(
        before=state.hooked_cgpms[token_linreg].logpdf_score(),
        after=state3.hooked_cgpms[token_linreg].logpdf_score())
def test_dependence_probability():
    """Dependence probabilities for State/Engine columns and hooked cgpms.

    First checks that, with the column partition fixed to the generated Zv,
    two columns are reported dependent exactly when they share a view. Then
    hooks four BareBonesCGpm instances onto columns of two different views
    and checks dependence between hooked outputs and their (non-)parents, and
    among the hooked outputs themselves.
    """
    cctypes, distargs = cu.parse_distargs(
        ['normal', 'poisson', 'bernoulli', 'lognormal', 'beta', 'vonmises'])
    T, Zv, _Zc = tu.gen_data_table(
        100, [.5, .5], [[.25, .25, .5], [.3, .7]], cctypes, distargs,
        [.95] * len(cctypes), rng=gu.gen_rng(100))
    T = T.T
    outputs = range(0, 12, 2)

    def depprob(model, col0, col1):
        # Shorthand for the reduced dependence probability of two columns.
        return compute_depprob(model.dependence_probability(col0, col1))

    # Test for direct dependence for state and engine.
    s = State(
        T, outputs=outputs, cctypes=cctypes, distargs=distargs,
        Zv=dict(zip(outputs, Zv)), rng=gu.gen_rng(0))
    e = Engine(
        T, outputs=outputs, cctypes=cctypes, distargs=distargs,
        Zv=dict(zip(outputs, Zv)), rng=gu.gen_rng(0))
    for C in (s, e):
        for col0, col1 in itertools.product(outputs, repeat=2):
            same_view = Zv[outputs.index(col0)] == Zv[outputs.index(col1)]
            assert depprob(C, col0, col1) == same_view

    # Hook some cgpms into state.

    # XXX What if Zv has only one unique value? Hopefully not with this rng!
    uniques = list(set(Zv))
    parent_1 = [o for o, z in zip(outputs, Zv) if z == uniques[0]]
    parent_2 = [o for o, z in zip(outputs, Zv) if z == uniques[1]]

    c1 = BareBonesCGpm(outputs=[1821, 154], inputs=[parent_1[0]])
    c2 = BareBonesCGpm(outputs=[1721], inputs=[parent_2[0]])
    c3 = BareBonesCGpm(outputs=[9721], inputs=[parent_2[1]])
    c4 = BareBonesCGpm(outputs=[74], inputs=[9721])

    # Expected dependence of each hooked output on columns of each view.
    expected_vs_parent_1 = [
        (1821, 1), (154, 1), (1721, 0), (9721, 0), (74, 0)]
    expected_vs_parent_2 = [
        (1821, 0), (154, 0), (1721, 1), (9721, 1), (74, 1)]
    # Expected dependence between pairs of hooked outputs.
    expected_between_hooked = [
        (9721, 1721, 1),
        (1821, 154, 1),
        (74, 9721, 1),
        (74, 1721, 1),
        (1821, 1721, 0),
        (1821, 74, 0),
        (154, 74, 0),
    ]

    # The State takes a bare cgpm; the Engine takes a list of cgpms.
    for C, as_list in [(s, False), (e, True)]:
        for cgpm in (c1, c2, c3, c4):
            C.compose_cgpm([cgpm] if as_list else cgpm)

        # Between hooked cgpms and state parents.
        for p in parent_1:
            for col, expected in expected_vs_parent_1:
                assert depprob(C, col, p) == expected
        for p in parent_2:
            for col, expected in expected_vs_parent_2:
                assert depprob(C, col, p) == expected

        # Between hooked cgpm.
        for col0, col1, expected in expected_between_hooked:
            assert depprob(C, col0, col1) == expected