def _test_multinomial_goodness_of_fit(dim):
    seed_all(0)
    sample_count = int(1e5)
    probs = numpy.random.dirichlet([1] * dim)

    counts = numpy.random.multinomial(sample_count, probs)
    p_good = multinomial_goodness_of_fit(probs, counts, sample_count)
    assert_greater(p_good, TEST_FAILURE_RATE)

    unif_counts = numpy.random.multinomial(sample_count, [1. / dim] * dim)
    p_bad = multinomial_goodness_of_fit(probs, unif_counts, sample_count)
    assert_less(p_bad, TEST_FAILURE_RATE)
def test_multinomial_goodness_of_fit(self):
    random.seed(0)
    numpy.random.seed(0)
    for dim in range(2, 20):
        sample_count = int(1e5)
        probs = numpy.random.dirichlet([1] * dim)

        counts = numpy.random.multinomial(sample_count, probs)
        p_good = multinomial_goodness_of_fit(probs, counts, sample_count)
        self.assertGreater(p_good, TEST_FAILURE_RATE)

        unif = [1. / dim] * dim
        unif_counts = numpy.random.multinomial(sample_count, unif)
        p_bad = multinomial_goodness_of_fit(probs, unif_counts, sample_count)
        self.assertLess(p_bad, TEST_FAILURE_RATE)
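# Note: the sketch below is illustrative only and is NOT the library's
# implementation of multinomial_goodness_of_fit. It shows the quantity the
# assertions above rely on: a Pearson chi-squared p-value that is high when
# the observed counts plausibly come from `probs` and low otherwise. The
# function name `pearson_gof_pvalue` is hypothetical.
import numpy as np
from scipy.stats import chi2


def pearson_gof_pvalue(probs, counts, total_count):
    probs = np.asarray(probs, dtype=np.float64)
    counts = np.asarray(counts, dtype=np.float64)
    expected = probs * total_count
    statistic = ((counts - expected) ** 2 / expected).sum()
    dof = len(probs) - 1  # counts are constrained to sum to total_count
    return chi2.sf(statistic, dof)  # upper-tail p-value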
def test_sample_from_probs_gof(size):
    set_random_seed(size)
    probs = np.exp(2 * np.random.random(size)).astype(np.float32)
    counts = np.zeros(size, dtype=np.int32)
    num_samples = 2000 * size
    for _ in range(num_samples):
        counts[sample_from_probs(probs)] += 1
    probs /= probs.sum()  # Normalize afterwards.
    print(counts)
    print(probs * num_samples)
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof
def test_sample_from_probs2_gof(size):
    set_random_seed(size)
    probs = np.exp(2 * np.random.random(size)).astype(np.float32)
    counts = np.zeros(size, dtype=np.int32)
    num_samples = 2000 * size
    probs2 = np.tile(probs, (num_samples, 1))
    samples = sample_from_probs2(probs2)
    probs /= probs.sum()  # Normalize afterwards.
    counts = np.bincount(samples, minlength=size)
    print(counts)
    print(probs * num_samples)
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof
def validate_gof(N, V, C, M, server, conditional):
    # Generate samples.
    expected = C**V
    num_samples = 1000 * expected
    ones = np.ones(V, dtype=np.int8)
    if conditional:
        cond_data = server.sample(1, ones)[0, :]
    else:
        cond_data = server.make_zero_row()
    samples = server.sample(num_samples, ones, cond_data)
    logprobs = server.logprob(samples + cond_data[np.newaxis, :])
    counts = {}
    probs = {}
    for sample, logprob in zip(samples, logprobs):
        key = tuple(sample)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
            probs[key] = np.exp(logprob)
    assert len(counts) == expected

    # Check accuracy using Pearson's chi-squared test.
    keys = sorted(counts.keys(), key=lambda key: -probs[key])
    counts = np.array([counts[k] for k in keys], dtype=np.int32)
    probs = np.array([probs[k] for k in keys])
    probs /= probs.sum()

    # Truncate to avoid low-precision.
    truncated = False
    valid = (probs * num_samples > 20)
    if not valid.all():
        T = valid.argmin()
        T = max(8, T)  # Avoid truncating too much.
        probs = probs[:T]
        counts = counts[:T]
        truncated = True

    gof = multinomial_goodness_of_fit(
        probs, counts, num_samples, plot=True, truncated=truncated)
    assert 1e-2 < gof
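# The truncation step above exists because Pearson's chi-squared approximation
# degrades when expected bin counts are small. Below is a hedged, standalone
# sketch of that step; the name `truncate_low_expectation` is hypothetical and
# the thresholds (20 expected counts, keep at least 8 bins) are taken from the
# test above, not from any library API.
import numpy as np


def truncate_low_expectation(probs, counts, total_count,
                             min_expected=20, min_keep=8):
    probs = np.asarray(probs, dtype=np.float64)
    counts = np.asarray(counts)
    order = np.argsort(probs)[::-1]          # highest-probability bins first
    probs, counts = probs[order], counts[order]
    valid = probs * total_count > min_expected
    if valid.all():
        return probs, counts, False
    T = max(min_keep, int(valid.argmin()))   # keep at least min_keep bins
    return probs[:T], counts[:T], True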
def test_assignment_sampler_gof(N, V, C, M):
    config = make_config(model_num_clusters=M)
    K = V * (V - 1) // 2
    dataset = generate_dataset(num_rows=N, num_cols=V, num_cats=C)
    table = dataset['table']
    tree_prior = np.exp(np.random.random(K), dtype=np.float32)
    trainer = TreeCatTrainer(table, tree_prior, config)
    print('Data:')
    print(dataset['table'].data)

    # Add all rows.
    set_random_seed(1)
    for row_id in range(N):
        trainer.add_row(row_id)

    # Collect samples.
    num_samples = 500 * M**(N * V)
    counts = {}
    logprobs = {}
    for _ in range(num_samples):
        for row_id in range(N):
            # This is a single-site Gibbs sampler.
            trainer.remove_row(row_id)
            trainer.add_row(row_id)
        key = hash_assignments(trainer._assignments)
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
            logprobs[key] = trainer.logprob()
    assert len(counts) == M**(N * V)

    # Check accuracy using Pearson's chi-squared test.
    keys = sorted(counts.keys())
    counts = np.array([counts[k] for k in keys], dtype=np.int32)
    probs = np.exp(np.array([logprobs[k] for k in keys]))
    probs /= probs.sum()
    print('Actual\tExpected\tAssignment')
    for count, prob, key in zip(counts, probs, keys):
        print('{:}\t{:0.1f}\t{}'.format(count, prob * num_samples, key))
    gof = multinomial_goodness_of_fit(probs, counts, num_samples, plot=True)
    assert 1e-2 < gof
def test_sample_tree_gof(num_edges):
    set_random_seed(num_edges)
    E = num_edges
    V = 1 + E
    grid = make_complete_graph(V)
    K = grid.shape[1]
    edge_logits = np.random.random([K])
    edge_probs = np.exp(edge_logits)
    edge_probs_dict = {(v1, v2): edge_probs[k] for k, v1, v2 in grid.T}

    # Generate many samples via MCMC.
    num_samples = 30 * NUM_SPANNING_TREES[V]
    counts = defaultdict(lambda: 0)
    edges = [(v, v + 1) for v in range(V - 1)]
    for _ in range(num_samples):
        edges = sample_tree(grid, edge_logits, edges)
        counts[tuple(edges)] += 1
    assert len(counts) == NUM_SPANNING_TREES[V]

    # Check accuracy using Pearson's chi-squared test.
    keys = counts.keys()
    counts = np.array([counts[key] for key in keys])
    probs = np.array(
        [np.prod([edge_probs_dict[edge] for edge in key]) for key in keys])
    probs /= probs.sum()

    # Possibly truncate.
    T = 100
    truncated = False
    if len(counts) > T:
        counts = counts[:T]
        probs = probs[:T]
        truncated = True

    gof = multinomial_goodness_of_fit(
        probs, counts, num_samples, plot=True, truncated=truncated)
    assert 1e-2 < gof
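# NUM_SPANNING_TREES above is presumably a lookup table of Cayley's formula:
# the complete graph on V labeled vertices has V**(V - 2) spanning trees
# (1, 1, 3, 16, 125 for V = 1..5), which is why the test can require that
# every tree is visited at least once. A tiny sketch of that count:
def num_spanning_trees_complete_graph(V):
    return 1 if V <= 2 else V ** (V - 2)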
def assert_counts_match_probs(counts, probs, tol=1e-3):
    '''
    Check goodness of fit of observed counts to predicted probabilities
    using Pearson's chi-squared test.

    Inputs:
    - counts : key -> int
    - probs : key -> float
    '''
    keys = counts.keys()
    probs = [probs[key] for key in keys]
    counts = [counts[key] for key in keys]
    total_count = sum(counts)

    print 'EXPECT\tACTUAL\tVALUE'
    for prob, count, key in sorted(izip(probs, counts, keys), reverse=True):
        expect = prob * total_count
        print '{:0.1f}\t{}\t{}'.format(expect, count, key)

    gof = multinomial_goodness_of_fit(probs, counts, total_count)
    print 'goodness of fit = {}'.format(gof)
    assert gof > tol, 'failed with goodness of fit {}'.format(gof)
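# Hypothetical usage of assert_counts_match_probs: tallies from a fair coin
# should fit the claimed probabilities. The numbers below are made up for
# illustration only.
counts = {'heads': 5021, 'tails': 4979}
probs = {'heads': 0.5, 'tails': 0.5}
assert_counts_match_probs(counts, probs)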
def _test_dataset_config(
        casename,
        object_count,
        feature_count,
        config_name,
        model_name,
        fixed_model_names,
        rows_name,
        config,
        debug):
    dataset = {'model': model_name, 'rows': rows_name, 'config': config_name}
    samples = generate_samples(casename, dataset, debug)

    fixed_hyper_samples = []
    for fixed_model_name in fixed_model_names:
        fixed_dataset = dataset.copy()
        fixed_dataset['model'] = fixed_model_name
        fs = generate_samples(None, fixed_dataset, debug)
        fixed_hyper_samples.append(fs)

    sample_count = config['posterior_enum']['sample_count']
    counts_dict = {}
    scores_dict = {}
    actual_count = 0
    for sample, score in samples:
        actual_count += 1
        add_sample(sample, score, counts_dict, scores_dict)
    assert_equal(actual_count, sample_count)

    if fixed_hyper_samples:
        latents, scores_dict = process_fixed_samples(
            fixed_hyper_samples, scores_dict.keys())
        useable_count = sum([counts_dict[lat] for lat in latents])
        if useable_count < sample_count:
            LOG('Warn', casename, 'scores found for {} / {} samples'.format(
                useable_count, sample_count))
        sample_count = useable_count
    else:
        latents = scores_dict.keys()

    actual_latent_count = len(latents)
    infer_kinds = (config['kernels']['kind']['iterations'] > 0)
    if infer_kinds:
        expected_latent_count = count_crosscats(object_count, feature_count)
    else:
        expected_latent_count = BELL_NUMBERS[object_count]
    assert actual_latent_count <= expected_latent_count, 'programmer error'
    if actual_latent_count < expected_latent_count:
        LOG('Warn', casename, 'found only {} / {} latents'.format(
            actual_latent_count, expected_latent_count))

    counts = numpy.array([counts_dict[key] for key in latents])
    scores = numpy.array([scores_dict[key] for key in latents])
    probs = scores_to_probs(scores)

    highest_by_prob = numpy.argsort(probs)[::-1][:TRUNCATE_COUNT]
    is_accurate = lambda p: sample_count * p * (1 - p) >= 1
    highest_by_prob = [i for i in highest_by_prob if is_accurate(probs[i])]
    highest_by_count = numpy.argsort(counts)[::-1][:TRUNCATE_COUNT]
    highest = list(set(highest_by_prob) | set(highest_by_count))
    truncated = len(highest_by_prob) < len(probs)
    if len(highest_by_prob) < 1:
        LOG('Warn', casename, 'test is inaccurate; use more samples')
        return None

    goodness_of_fit = multinomial_goodness_of_fit(
        probs[highest_by_prob],
        counts[highest_by_prob],
        total_count=sample_count,
        truncated=truncated)
    comment = 'goodness of fit = {:0.3g}'.format(goodness_of_fit)
    if goodness_of_fit > MIN_GOODNESS_OF_FIT:
        LOG('Pass', casename, comment)
        return None
    else:
        print 'EXPECT\tACTUAL\tCHI\tVALUE'
        lines = [(probs[i], counts[i], latents[i]) for i in highest]
        for prob, count, latent in sorted(lines, reverse=True):
            expect = prob * sample_count
            chi = (count - expect) * expect ** -0.5
            pretty = pretty_latent(latent)
            print '{:0.1f}\t{}\t{:+0.1f}\t{}'.format(
                expect, count, chi, pretty)
        return LOG('Fail', casename, comment)
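# Hedged sketch of what scores_to_probs above presumably does: convert
# unnormalized log-scores into probabilities via the log-sum-exp shift so that
# large scores do not overflow exp(). Not necessarily the library's exact code;
# the name `scores_to_probs_sketch` is hypothetical.
import numpy as np


def scores_to_probs_sketch(scores):
    scores = np.asarray(scores, dtype=np.float64)
    probs = np.exp(scores - scores.max())  # shift so the largest score is 0
    return probs / probs.sum()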