def get_crosscat(prng):
    """Build a one-view crosscat Product over Normal outputs 0 and 1.

    The single view is a FlexibleRowMixture whose row partition is a CRP
    on output 2.  Every constructed object shares the given `prng`.
    """
    # The CRP is built before the column models, matching the order in
    # which the keyword arguments were originally evaluated.
    row_divide = CRP([2], [], rng=prng)
    columns = [
        Normal([0], [], rng=prng),
        Normal([1], [], rng=prng),
    ]
    components_base = Product(columns, rng=prng)
    view = FlexibleRowMixture(
        cgpm_row_divide=row_divide,
        cgpm_components_base=components_base,
        rng=prng)
    return Product(cgpms=[view], rng=prng)
def test_crosscat_two_component_nominal__ci_():
    """Run GibbsCrossCat on two-cluster data extended with a nominal column.

    Outputs 0 and 1 carry the bivariate data and output 50 carries a
    four-valued nominal label assigned in consecutive blocks of 15 rows.
    The merged-view assertion and the check_sampled_data calls only run
    when the --integration option is set; without it each transition call
    runs a single step and the simulate calls are merely exercised.
    """
    prng = get_prng(10)
    integration = pytest.config.getoption('--integration')

    # Build CGPM with adversarial initialization: output 0 is alone in
    # the first view, outputs 1 and 50 share the second view.
    view_first = FlexibleRowMixture(
        cgpm_row_divide=CRP([-1], [], rng=prng),
        cgpm_components_base=Product([
            Normal([0], [], rng=prng),
        ], rng=prng),
        rng=prng)
    view_second = FlexibleRowMixture(
        cgpm_row_divide=CRP([-2], [], rng=prng),
        cgpm_components_base=Product([
            Normal([1], [], rng=prng),
            Categorical([50], [], distargs={'k':4}, rng=prng),
        ], rng=prng),
        rng=prng)
    crosscat = Product([view_first, view_second], rng=prng)

    # Fetch data and add a nominal variable (label k covers rows
    # 15*k .. 15*k + 14).
    data_xy = make_bivariate_two_clusters(prng)
    data_z = np.zeros(len(data_xy))
    for label in range(4):
        data_z[15 * label:15 * (label + 1)] = label
    data = np.column_stack((data_xy, data_z))

    # Observe.
    for rowid, row in enumerate(data):
        crosscat.observe(rowid, {0: row[0], 1: row[1], 50: row[2]})

    # Run inference.
    if integration:
        n_all_kernels, n_hyper_kernels = 50, 100
    else:
        n_all_kernels, n_hyper_kernels = 1, 1
    synthesizer = GibbsCrossCat(crosscat)
    synthesizer.transition(N=n_all_kernels, progress=False)
    synthesizer.transition(
        N=n_hyper_kernels,
        kernels=['hypers_distributions','hypers_row_divide'],
        progress=False)

    # Assert views are merged into one.
    if integration:
        assert len(synthesizer.crosscat.cgpms) == 1
    crp_output = synthesizer.crosscat.cgpms[0].cgpm_row_divide.outputs[0]

    # Check joint samples for all nominals.
    samples = synthesizer.crosscat.simulate(None, [crp_output,0,1,50], N=250)
    if integration:
        check_sampled_data(samples, [0, 7], 3, 110)

    # Check joint samples restricted to nominals [0, 2], then [1, 3].
    for nominals in ([0,2], [1,3]):
        subset = [s for s in samples if s[50] in nominals]
        if integration:
            check_sampled_data(subset, [0, 7], 3, 45)

    # Check conditional samples in correct quadrants.
    means = {0:0, 1:0, 2:7, 3:7}
    for z in [0, 1, 2, 3]:
        samples = synthesizer.crosscat.simulate(None, [0, 1], {50:z}, N=100)
        if integration:
            check_sampled_data(samples, [means[z]], 3, 90)
def test_flexible_mixture_two_component__ci_():
    """Hand a two-column FlexibleRowMixture to the shared mixture test.

    The mixture models Normal outputs 0 and 1 with a CRP row partition on
    output 2.  The --integration option is forwarded to run_mixture_test
    unchanged.
    """
    prng = get_prng(2)
    integration = pytest.config.getoption('--integration')
    row_divide = CRP([2], [], rng=prng)
    components_base = Product([
        Normal([0], [], rng=prng),
        Normal([1], [], rng=prng),
    ], rng=prng)
    flexible_mixture = FlexibleRowMixture(
        cgpm_row_divide=row_divide,
        cgpm_components_base=components_base,
        rng=prng)
    run_mixture_test(flexible_mixture, integration, prng)
def test_simple_product():
    """Check outputs, inputs, simulate and logpdf of a three-column Product.

    The product combines two Normals (outputs 0 and 1) and a four-category
    Categorical (output 2).
    """
    prng = get_prng(2)
    columns = [
        Normal([0], [], rng=prng),
        Normal([1], [], rng=prng),
        Categorical([2], [], distargs={'k':4}, rng=prng),
    ]
    product = Product(columns, prng)
    assert product.outputs == [0,1,2]
    assert product.inputs == []
    # Query the outputs in a shuffled order; the sample must still be
    # keyed by exactly the requested outputs.
    draw = product.simulate(None, [1,2,0])
    assert set(draw.keys()) == set([1, 2, 0])
    assert product.logpdf(None, draw) < 0
def test_finite_mixture_three_component__ci_():
    """Hand a three-component FiniteRowMixture to the shared mixture test.

    Each component is a Normal on output 0; the component assignment is a
    three-category Categorical on output 1.  The --integration option is
    forwarded to run_mixture_test unchanged.
    """
    prng = get_prng(2)
    row_divide = Categorical([1], [], distargs={'k': 3}, rng=prng)
    # Three distinct Normal instances, constructed one after another.
    components = [Normal([0], [], rng=prng) for _index in range(3)]
    finite_mixture = FiniteRowMixture(
        cgpm_row_divide=row_divide,
        cgpm_components=components,
        rng=prng,
    )
    integration = pytest.config.getoption('--integration')
    run_mixture_test(finite_mixture, integration, prng)
def get_crosscat_synthesizer(prng):
    """Return a GibbsCrossCat over a one-view crosscat loaded with data.

    The view models Normal outputs 0 and 1 with a CRP row partition on
    output 2.  Rows from make_bivariate_two_clusters(prng) are observed
    into the crosscat, using each row's position as its rowid, before the
    synthesizer is created.
    """
    row_divide = CRP([2], [], rng=prng)
    components_base = Product([
        Normal([0], [], rng=prng),
        Normal([1], [], rng=prng),
    ], rng=prng)
    view = FlexibleRowMixture(
        cgpm_row_divide=row_divide,
        cgpm_components_base=components_base,
        rng=prng)
    crosscat = Product(cgpms=[view], rng=prng)
    dataset = make_bivariate_two_clusters(prng)
    for rowid, values in enumerate(dataset):
        observation = {0: values[0], 1: values[1]}
        crosscat.observe(rowid, observation)
    return GibbsCrossCat(crosscat)
def test_transition_hypers_basic():
    """Check hyperparameter transitions on the Normal column of a mixture.

    After three observations of output 1, transition_hypers is applied
    twice to the CGPMs for that output.  The test asserts that the two
    returned values are not all equal and that the mixture's
    logpdf_score is higher afterwards than before.
    """
    prng = get_prng(2)
    poisson_column = Poisson([0], [], hypers={'m': 100}, rng=prng)
    normal_column = Normal([1], [], hypers={'m': 100}, rng=prng)
    component0 = Product([poisson_column, normal_column], rng=prng)
    cgpm_row_divide = CRP([2], [], rng=prng)
    infinite_mixture = FlexibleRowMixture(
        cgpm_row_divide=cgpm_row_divide,
        cgpm_components_base=component0,
        rng=prng)

    # Make normal observations.
    for rowid, value in enumerate((100, 300, -300)):
        infinite_mixture.observe(rowid, {1: value})

    # Fetch log score.
    score_before = infinite_mixture.logpdf_score()

    # Run inference.
    normal_cgpms = get_cgpms_by_output_index(infinite_mixture, 1)
    grids_normal = transition_hyper_grids(normal_cgpms, 30)
    hypers_normal = []
    for _i in xrange(2):
        hypers_normal.append(
            transition_hypers(normal_cgpms, grids_normal, prng))
    reference = hypers_normal[0]
    assert not all(hypers == reference for hypers in hypers_normal)

    score_after = infinite_mixture.logpdf_score()
    assert score_before < score_after
def test_finite_mixture_probabilities():
    """Check simulate and logpdf of a three-component FiniteRowMixture.

    The components are Normals on output 0 with hyperparameter 'm' set to
    100, -100 and 0; the component assignment is a Categorical on
    output 1.  After observing three rows at value 100 in component 0,
    the test checks the ordering of likelihoods and posterior assignment
    probabilities, and that out-of-range assignments raise ValueError.
    """
    prng = get_prng(2)
    components = [
        Normal([0], [], hypers={'m': mean}, rng=prng)
        for mean in (100, -100, 0)
    ]
    cgpm_row_divide = Categorical([1], [], distargs={'k':3}, rng=prng)
    finite_mixture = FiniteRowMixture(
        cgpm_row_divide=cgpm_row_divide,
        cgpm_components=components,
        rng=prng)

    # Make observations into component0 at prior mean.
    for rowid in (1, 2, 3):
        finite_mixture.observe(rowid, {0:100, 1:0})
    assert finite_mixture.rowid_to_component == {1:0, 2:0, 3:0}

    # Sample of cluster assignments given data is 100.
    samples = finite_mixture.simulate(None, [1], constraints={0:100}, N=20)
    in_component0 = [s for s in samples if s[1]==0]
    assert len(in_component0) > int(0.9*len(samples))

    # Compute likelihood of data given cluster assignment.
    likelihoods = [
        finite_mixture.logpdf(None, {0:100}, constraints={1:k})
        for k in (0, 1, 2)
    ]
    assert likelihoods[1] < likelihoods[0]
    assert likelihoods[2] < likelihoods[0]
    assert likelihoods[1] < likelihoods[2]

    # Compute posterior probabilities of cluster assignment.
    posteriors = [
        finite_mixture.logpdf(None, {1:k}, constraints={0:100})
        for k in (0, 1, 2)
    ]
    assert posteriors[1] < posteriors[0]
    assert posteriors[2] < posteriors[0]
    assert posteriors[1] < posteriors[2]

    # Constrained cluster has zero density.
    for invalid in (-1, 20):
        with pytest.raises(ValueError):
            finite_mixture.logpdf(None, {0:100}, constraints={1:invalid})
    for invalid in (-1, 20):
        with pytest.raises(ValueError):
            finite_mixture.simulate(None, [0], constraints={1:invalid})
def test_simple_product_finite_array():
    """Check a Product of two FiniteArrays selected by indexer inputs.

    Each FiniteArray holds three Normals and is selected through its own
    indexer input (128 and 129).  The test checks that missing indexers
    raise, that simulation uses the indexed component, and that the
    product logpdf equals the sum of the indexed components' logpdfs.
    """
    prng = get_prng(2)

    cells_0 = [
        Normal([0], [], hypers={'m': mean}, rng=prng)
        for mean in (100, -100, 0)
    ]
    indexer_0 = 128
    cgpm_array_0 = FiniteArray(cgpms=cells_0, indexer=indexer_0, rng=prng)
    with pytest.raises(Exception):
        # Missing indexer_0 as a required input.
        cgpm_array_0.simulate(None, [0])

    cells_1 = [
        Normal([1], [], hypers={'m': mean}, rng=prng)
        for mean in (1000, -1000, 50)
    ]
    indexer_1 = 129
    cgpm_array_1 = FiniteArray(cgpms=cells_1, indexer=indexer_1, rng=prng)

    product = Product([cgpm_array_0, cgpm_array_1], prng)
    assert product.outputs == [0, 1]
    assert product.inputs == [indexer_0, indexer_1]

    with pytest.raises(Exception):
        # Missing indexer_0.
        product.simulate(None, [0,1], inputs={indexer_1: 0})
    with pytest.raises(Exception):
        # Missing indexer_1.
        product.simulate(None, [0,1], inputs={indexer_0: 1})
    # Should work, since output 1 is not being queried.
    product.simulate(None, [0], inputs={indexer_0: 1})

    # Sampling from correct components: cell 1 of the first array and
    # cell 0 of the second.
    sample = product.simulate(None, [0,1], inputs={indexer_0:1, indexer_1:0})
    assert abs(-100 - sample[0]) < 10
    assert abs(1000 - sample[1]) < 10

    logp = product.logpdf(None, sample, inputs={indexer_0:1, indexer_1:0})
    logp_expected = (
        cells_0[1].logpdf(None, {0: sample[0]})
        + cells_1[0].logpdf(None, {1: sample[1]})
    )
    assert np.allclose(logp, logp_expected)
def test_simple_product_as_chain():
    """Check a FlexibleRowMixture whose base component is a Chain.

    The Chain holds a Poisson (output 0) and a Normal (output 1); the row
    partition is a CRP on output 2.  The test covers logpdf identities
    over the CRP support, observe/unobserve round trips, ValueError on a
    constrained cluster, and unobserve after a to_metadata/from_metadata
    conversion.
    """
    prng = get_prng(2)
    component0 = Chain([
        Poisson([0], [], hypers={'a': 10, 'b': 1}, rng=prng),
        Normal([1], [], hypers={'m':100}, rng=prng),
    ], rng=prng)
    cgpm_row_divide = CRP([2], [], rng=prng)
    infinite_mixture = FlexibleRowMixture(
        cgpm_row_divide=cgpm_row_divide,
        cgpm_components_base=component0,
        rng=prng)
    assert infinite_mixture.cgpm_row_divide.support() == [0]

    # Test logpdf identities.
    lp_marginal = infinite_mixture.logpdf(None, {0:1})
    assert lp_marginal < 0
    lp_cluster0 = infinite_mixture.logpdf(None, {0:1, 2:0})
    assert np.allclose(lp_marginal, lp_cluster0)
    lp_cluster1 = infinite_mixture.logpdf(None, {0:1, 2:1})
    assert lp_cluster1 == -float('inf')

    # Add an observation.
    infinite_mixture.observe(0, {1:100})
    lp_by_cluster = [
        infinite_mixture.logpdf(None, {1:100, 2:k}, constraints={0:1})
        for k in (0, 1, 2)
    ]
    assert lp_by_cluster[1] < lp_by_cluster[0]
    assert lp_by_cluster[2] == float('-inf')

    # Remove observation.
    assert infinite_mixture.unobserve(0) == ({1:100, 2:0}, {})
    # Remove observation again.
    with pytest.raises(Exception):
        infinite_mixture.unobserve(0)

    # Add more observations.
    observations = [(0, {1:100}), (1, {1:300}), (2, {0:2})]
    for rowid, values in observations:
        infinite_mixture.observe(rowid, values)

    # Constrained cluster has zero density.
    # NOTE(review): the same logpdf check is issued twice, as in the
    # original; possibly one was meant to exercise simulate -- confirm.
    for _attempt in range(2):
        with pytest.raises(ValueError):
            infinite_mixture.logpdf(None, {0:1}, constraints={2:10})

    # Convert to/from metadata and assert unobserves return correct data.
    metadata = infinite_mixture.to_metadata()
    infinite_mixture2 = FlexibleRowMixture.from_metadata(metadata, prng)
    for rowid, values in observations:
        removed = infinite_mixture2.unobserve(rowid)
        expected = dict(values)
        expected[2] = infinite_mixture.cgpm_row_divide.data[rowid]
        assert removed == (expected, {})
def get_crosscat(prng):
    """Build a three-view crosscat Product over outputs 0 through 5.

    View 0 (CRP output -1) holds Normals on outputs 0 and 1; view 1 (CRP
    output -2) holds a Poisson on output 2 and Normals on outputs 3 and
    4; view 2 (CRP output -3) holds a four-category Categorical on
    output 5.  Every constructed object shares the given `prng`.
    """
    # Within each view the CRP is built before its column models.
    divide0 = CRP([-1], [], rng=prng)
    base0 = Product([
        Normal([0], [], rng=prng),
        Normal([1], [], rng=prng),
    ], rng=prng)
    view0 = FlexibleRowMixture(
        cgpm_row_divide=divide0, cgpm_components_base=base0, rng=prng)

    divide1 = CRP([-2], [], rng=prng)
    base1 = Product([
        Poisson([2], [], rng=prng),
        Normal([3], [], rng=prng),
        Normal([4], [], rng=prng),
    ], rng=prng)
    view1 = FlexibleRowMixture(
        cgpm_row_divide=divide1, cgpm_components_base=base1, rng=prng)

    divide2 = CRP([-3], [], rng=prng)
    base2 = Product([
        Categorical([5], [], distargs={'k':4}, rng=prng),
    ], rng=prng)
    view2 = FlexibleRowMixture(
        cgpm_row_divide=divide2, cgpm_components_base=base2, rng=prng)

    return Product([view0, view1, view2], rng=prng)
def test_add_remove():
    """Check that add_cgpm/remove_cgpm return mixtures with new outputs.

    After every add or remove step, the outputs of the new mixture and of
    all previously created mixtures are asserted, so the test fails if an
    earlier mixture's outputs change.  Removing the row-divide output
    must raise.
    """
    prng = get_prng(2)
    mixture0 = FlexibleRowMixture(
        cgpm_row_divide=CRP([2], [], rng=prng),
        cgpm_components_base=Product([
            Normal([0], [], rng=prng),
            Normal([1], [], rng=prng),
        ], rng=prng),
        rng=prng)
    for rowid, row in enumerate([[0,.9] ,[.5, 1], [-.5, 1.2]]):
        mixture0.observe(rowid, {0:row[0], 1:row[1]})

    # (mixture, expected outputs) pairs, oldest first.
    expectations = [(mixture0, [2, 0, 1])]

    def register_and_check(mixture, outputs):
        # Record the new mixture, then re-assert every mixture so far.
        expectations.append((mixture, outputs))
        for known, expected in expectations:
            assert known.outputs == expected

    mixture1 = remove_cgpm(mixture0, 0)
    register_and_check(mixture1, [2, 1])

    mixture2 = add_cgpm(mixture1, Normal([0], [], rng=prng))
    register_and_check(mixture2, [2, 1, 0])

    mixture3 = remove_cgpm(mixture2, 1)
    register_and_check(mixture3, [2, 0])

    mixture4 = remove_cgpm(mixture3, 0)
    register_and_check(mixture4, [2])

    with pytest.raises(Exception):
        # Cannot remove the cgpm_row_divide for a mixture.
        remove_cgpm(mixture2, 2)
def test_transition_crp_mixture():
    """Check row and hyper transitions on a CRP mixture of three clusters.

    The data are 20 draws each from normals centered at 0, 30 and -30.
    After 50 sweeps of row and hyperparameter transitions, more than 95%
    of the rows in each block of 20 must share that block's most common
    assignment.
    """
    prng = get_prng(2)
    draws = [
        prng.normal(loc=0, scale=2, size=20),
        prng.normal(loc=30, scale=1, size=20),
        prng.normal(loc=-30, scale=1, size=20),
    ]
    data = np.concatenate(draws)

    row_divide = CRP([1], [], rng=prng)
    components_base = Normal([0], [], rng=prng)
    infinite_mixture = FlexibleRowMixture(
        cgpm_row_divide=row_divide,
        cgpm_components_base=components_base,
        rng=prng)
    for rowid, value in enumerate(data):
        infinite_mixture.observe(rowid, {0: value})

    cgpms = {}
    for output in (0, 1):
        cgpms[output] = get_cgpms_by_output_index(infinite_mixture, output)
    grids = {}
    for output in (0, 1):
        grids[output] = transition_hyper_grids(cgpms[output], 30)

    for _step in xrange(50):
        # Visit the rows in a fresh random order on every sweep.
        for rowid in prng.permutation(range(len(data))):
            transition_rows(infinite_mixture, rowid, prng)
        for output in infinite_mixture.outputs:
            transition_hypers(cgpms[output], grids[output], prng)

    # Collect the assignments for all three blocks of 20 rows before
    # asserting anything.
    rowids = range(60)
    assignments_by_block = []
    for start in (0, 20, 40):
        block = rowids[start:start + 20]
        assignments_by_block.append(
            [infinite_mixture.simulate(r, [1])[1] for r in block])
    modes = [
        Counter(assignments).most_common(1)[0][0]
        for assignments in assignments_by_block
    ]
    for assignments, mode in zip(assignments_by_block, modes):
        n_at_mode = sum(a == mode for a in assignments)
        assert n_at_mode > int(0.95 * len(assignments))
def test_crosscat_three_component_cpp__ci_():
    """Run the shared crosscat test using transition_structure_cpp.

    The crosscat has one view with a single Normal on output 0 and a CRP
    row partition on output 1.  The inference callback runs 1000
    structure steps when --integration is set and one step otherwise,
    followed by one call each of transition_hypers_distributions and
    transition_hypers_row_divide.
    """
    prng = get_prng(12)
    integration = pytest.config.getoption('--integration')
    row_divide = CRP([1], [], rng=prng)
    components_base = Product(cgpms=[Normal([0], [], rng=prng)], rng=prng)
    view = FlexibleRowMixture(
        cgpm_row_divide=row_divide,
        cgpm_components_base=components_base,
        rng=prng)
    crosscat = Product(cgpms=[view], rng=prng)

    def func_inference(crosscat):
        # Callback handed to run_crosscat_test; returns the synthesizer
        # after running the structure and hyperparameter kernels.
        if integration:
            n_step = 1000
        else:
            n_step = 1
        synthesizer = GibbsCrossCat(crosscat)
        synthesizer.transition_structure_cpp(N=n_step)
        synthesizer.transition_hypers_distributions()
        synthesizer.transition_hypers_row_divide()
        return synthesizer

    run_crosscat_test(crosscat, func_inference, integration, prng)
def test_flexible_array_observe():
    """Check that FlexibleArray observations affect only the indexed cell.

    Six rows with value 10000 are observed into cell 10 of an array whose
    base Normal has hyperparameter 'm' set to 0.  Samples from cell 10
    must be mostly above 1000, while samples from cell 0 must never
    exceed 1000 and must mostly lie in (-10, 10).
    """
    prng = get_prng(2)
    component0 = Normal([0], [], hypers={'m':0}, rng=prng)
    indexer_0 = 128
    cgpm_array_1 = FlexibleArray(
        cgpm_base=component0, indexer=indexer_0, rng=prng)

    # Make observations into cell 10.
    for rowid in range(100, 106):
        cgpm_array_1.observe(rowid, {0:10000}, inputs={indexer_0:10})

    # Simulate from cell 10 (high mean).
    samples = cgpm_array_1.simulate(None, [0], inputs={indexer_0: 10}, N=100)
    n_high = sum(1 for s in samples if s[0] > 1000)
    assert n_high > int(.9*len(samples))

    # Simulate from cell 0 (zero mean).
    samples = cgpm_array_1.simulate(None, [0], inputs={indexer_0: 0}, N=100)
    n_high = sum(1 for s in samples if s[0] > 1000)
    n_near_zero = sum(1 for s in samples if -10 < s[0] < 10)
    assert n_high == 0
    assert n_near_zero > int(.9*len(samples))
def test_transition_rows_fixed_mixture():
    """Check that transition_rows moves rows to their matching components.

    Six rows are observed with assignment 0 although their values match
    three different components in pairs.  After ten sweeps of
    transition_rows, the rows must be assigned to components
    0, 0, 1, 1, 2, 2 respectively.
    """
    prng = get_prng(2)
    component0 = Product([
        Normal([0], [], hypers={'m':1000}, rng=prng),
        Normal([1], [], hypers={'m':0}, rng=prng),
    ], rng=prng)
    component1 = Product([
        Normal([0], [], hypers={'m':-1000}, rng=prng),
        Normal([1], [], hypers={'m':1000}, rng=prng),
    ], rng=prng)
    component2 = Product([
        Normal([0], [], hypers={'m':0}, rng=prng),
        Normal([1], [], hypers={'m':-100}, rng=prng),
    ], rng=prng)
    cgpm_row_divide = Categorical([2], [], distargs={'k':3}, rng=prng)
    finite_mixture = FiniteRowMixture(
        cgpm_row_divide=cgpm_row_divide,
        cgpm_components=[component0, component1, component2],
        rng=prng)

    # Every row is initially placed in component 0 (output 2 is 0).
    rows = [
        # For component 0.
        {0:1000, 1:0, 2:0},
        {0:990, 1:-10, 2:0},
        # For component 1.
        {0:-1000, 1:1000, 2:0},
        {0:-990, 1:990, 2:0},
        # For component 2.
        {0:0, 1:-1000, 2:0},
        {0:10, 1:-990, 2:0},
    ]
    for rowid, row in enumerate(rows):
        finite_mixture.observe(rowid, row)

    # Confirm all rows in component 0.
    for rowid in range(6):
        assert finite_mixture.simulate(rowid, [2]) == {2:0}

    # Run transitions.
    for _i in xrange(10):
        for rowid in range(6):
            transition_rows(finite_mixture, rowid, prng)

    # Confirm all rows in correct components.
    for rowid, component in enumerate([0, 0, 1, 1, 2, 2]):
        assert finite_mixture.simulate(rowid, [2]) == {2:component}
def test_product_mixture_walk():
    """Check get_cgpms_by_output_index on a FlexibleRowMixture.

    Before any observation the lookup must return only the base CGPMs
    (and the row-divide CGPM for output 2).  After one observation of
    output 0 there must be two CGPMs per column, with the base CGPM last.
    Asking for a non-existent output must raise.
    """
    prng = get_prng(2)
    poisson_base = Poisson([0], [], hypers={'a': 10, 'b': 1}, rng=prng)
    normal_base = Normal([1], [], hypers={'m': 100}, rng=prng)
    component_base = Product([poisson_base, normal_base], rng=prng)
    cgpm_row_divide = CRP([2], [], rng=prng)
    infinite_mixture = FlexibleRowMixture(
        cgpm_row_divide=cgpm_row_divide,
        cgpm_components_base=component_base,
        rng=prng)

    # Only the base CGPMs in the flexible mixture.
    found_poisson = get_cgpms_by_output_index(infinite_mixture, 0)
    found_normal = get_cgpms_by_output_index(infinite_mixture, 1)
    found_crp = get_cgpms_by_output_index(infinite_mixture, 2)
    assert found_poisson == [component_base.cgpms[0]]
    assert found_normal == [component_base.cgpms[1]]
    assert found_crp == [cgpm_row_divide]

    infinite_mixture.observe(0, {0: 1})

    # New CGPMs in the flexible CGPM after observing.
    found_poisson = get_cgpms_by_output_index(infinite_mixture, 0)
    found_normal = get_cgpms_by_output_index(infinite_mixture, 1)
    assert len(found_poisson) == len(found_normal) == 2
    assert [found_poisson[-1]] == [component_base.cgpms[0]]
    assert [found_normal[-1]] == [component_base.cgpms[1]]
    # Only output 0 was observed, so only the Poisson has a data point.
    assert found_poisson[0].N == 1
    assert found_normal[0].N == 0
    found_crp = get_cgpms_by_output_index(infinite_mixture, 2)
    assert len(found_crp) == 1
    row_divide_cgpm = found_crp[0]
    assert row_divide_cgpm.N == 1
    assert row_divide_cgpm.data[0] == 0

    # Misc. errors, no such output.
    with pytest.raises(Exception):
        get_cgpms_by_output_index(infinite_mixture, -1)
def test_product_mixture_constraints():
    """Check constrained and unconstrained simulation of a finite mixture.

    The mixture has three Product components over Normal outputs 0 and 1,
    with hyperparameter 'm' set to 1000, -1000 and 0 respectively, and a
    three-category Categorical assignment on output 2.  The same checks
    run on the original mixture and on a copy rebuilt through
    to_metadata/from_metadata.
    """
    prng = get_prng(2)
    component0 = Product([
        Normal([0], [], hypers={'m':1000}, rng=prng),
        Normal([1], [], hypers={'m':1000}, rng=prng),
    ], rng=prng)
    component1 = Product([
        Normal([0], [], hypers={'m':-1000}, rng=prng),
        Normal([1], [], hypers={'m':-1000}, rng=prng),
    ], rng=prng)
    component2 = Product([
        Normal([0], [], hypers={'m':0}, rng=prng),
        Normal([1], [], hypers={'m':0}, rng=prng),
    ], rng=prng)
    cgpm_row_divide = Categorical([2], [], distargs={'k':3}, rng=prng)
    finite_mixture = FiniteRowMixture(
        cgpm_row_divide=cgpm_row_divide,
        cgpm_components=[component0, component1, component2],
        rng=prng)

    def run_mixture_tests(mixture):
        N = 100
        # Simulate from component 1.
        samples = mixture.simulate(None, [0], constraints={2:1}, N=N)
        assert len([s for s in samples if -1100 < s[0] < -900]) > int(.9*N)
        # Simulate from random components.
        samples = mixture.simulate(None, [0], N=N)
        # The original interval was written as -900 < s[0] < -1100, which
        # no number satisfies, so the check always passed.  With the
        # interval corrected, roughly a third of the draws are presumably
        # expected to land near component 1, so the bound is half of N
        # rather than a third to leave room for sampling noise.
        near_component1 = [s for s in samples if -1100 < s[0] < -900]
        assert len(near_component1) < int(.5*N)
        # Simulate (implicitly) from component 0.
        samples = mixture.simulate(None, [1,2], constraints={0:1000}, N=N)
        assert len([s for s in samples if 900 < s[1] < 1100]) > int(.9*N)
        assert len([s for s in samples if s[2] == 0]) == N

    # Run tests on finite_mixture.
    run_mixture_tests(finite_mixture)
    # Run tests after to/from metadata conversion.
    metadata = finite_mixture.to_metadata()
    finite_mixture2 = FiniteRowMixture.from_metadata(metadata, prng)
    run_mixture_tests(finite_mixture2)