import numpy
from itertools import izip
from nose.tools import assert_equal, assert_greater, assert_less
# Assumed import path: in the distributions codebase these helpers live in
# distributions.util; adjust if the local module layout differs.
from distributions.util import (
    multinomial_goodness_of_fit,
    scores_to_probs,
    seed_all,
)


def _test_multinomial_goodness_of_fit(dim):
    seed_all(0)
    thresh = 1e-3
    sample_count = int(1e5)
    probs = numpy.random.dirichlet([1] * dim)

    # Counts drawn from the true multinomial should fit well.
    counts = numpy.random.multinomial(sample_count, probs)
    p_good = multinomial_goodness_of_fit(probs, counts, sample_count)
    assert_greater(p_good, thresh)

    # Counts drawn from a uniform multinomial should fit poorly.
    unif_counts = numpy.random.multinomial(sample_count, [1. / dim] * dim)
    p_bad = multinomial_goodness_of_fit(probs, unif_counts, sample_count)
    assert_less(p_bad, thresh)
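
# A sketch of how this helper is typically driven: nose-style test
# generators yield one case per dimension. The wrapper name and the
# dimension range here are illustrative assumptions, not from the original.
def test_multinomial_goodness_of_fit():
    for dim in range(2, 10):
        yield _test_multinomial_goodness_of_fit, dim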

def assert_counts_match_probs(counts, probs, tol=1e-3):
    '''
    Check goodness of fit of observed counts to predicted probabilities
    using Pearson's chi-squared test.

    Inputs:
    - counts : key -> int
    - probs : key -> float
    '''
    keys = counts.keys()
    probs = [probs[key] for key in keys]
    counts = [counts[key] for key in keys]
    total_count = sum(counts)
    print 'EXPECT\tACTUAL\tVALUE'
    for prob, count, key in sorted(izip(probs, counts, keys), reverse=True):
        expect = prob * total_count
        print '{:0.1f}\t{}\t{}'.format(expect, count, key)
    gof = multinomial_goodness_of_fit(probs, counts, total_count)
    print 'goodness of fit = {}'.format(gof)
    assert gof > tol, 'failed with goodness of fit {}'.format(gof)
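
# Illustrative usage of assert_counts_match_probs: counts and probs are
# dicts over the same keys, and the assertion passes when the chi-squared
# p-value exceeds tol. The numbers below are made up for the example.
def _example_counts_match_probs():
    probs = {'a': 0.5, 'b': 0.3, 'c': 0.2}
    counts = {'a': 5017, 'b': 2993, 'c': 1990}
    assert_counts_match_probs(counts, probs)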

def _test_dataset_config(
        casename,
        object_count,
        feature_count,
        config_name,
        model_name,
        fixed_model_names,
        rows_name,
        config,
        debug):
    dataset = {'model': model_name, 'rows': rows_name, 'config': config_name}
    samples = generate_samples(casename, dataset, debug)

    # Draw auxiliary sample streams with hyperparameters held fixed.
    fixed_hyper_samples = []
    for fixed_model_name in fixed_model_names:
        fixed_dataset = dataset.copy()
        fixed_dataset['model'] = fixed_model_name
        fs = generate_samples(None, fixed_dataset, debug)
        fixed_hyper_samples.append(fs)

    # Tally each latent's empirical count and its posterior score.
    sample_count = config['posterior_enum']['sample_count']
    counts_dict = {}
    scores_dict = {}
    actual_count = 0
    for sample, score in samples:
        actual_count += 1
        add_sample(sample, score, counts_dict, scores_dict)
    assert_equal(actual_count, sample_count)

    if fixed_hyper_samples:
        latents, scores_dict = process_fixed_samples(
            fixed_hyper_samples,
            scores_dict.keys())
        useable_count = sum([counts_dict[lat] for lat in latents])
        if useable_count < sample_count:
            LOG('Warn', casename, 'scores found for {} / {} samples'.format(
                useable_count, sample_count))
        sample_count = useable_count
    else:
        latents = scores_dict.keys()

    # Sanity-check latent-space coverage against the known state count.
    actual_latent_count = len(latents)
    infer_kinds = (config['kernels']['kind']['iterations'] > 0)
    if infer_kinds:
        expected_latent_count = count_crosscats(object_count, feature_count)
    else:
        expected_latent_count = BELL_NUMBERS[object_count]
    assert actual_latent_count <= expected_latent_count, 'programmer error'
    if actual_latent_count < expected_latent_count:
        LOG('Warn', casename, 'found only {} / {} latents'.format(
            actual_latent_count, expected_latent_count))

    counts = numpy.array([counts_dict[key] for key in latents])
    scores = numpy.array([scores_dict[key] for key in latents])
    probs = scores_to_probs(scores)

    # Truncate to the most probable latents, keeping only those whose
    # expected count variance is large enough for the test to resolve.
    highest_by_prob = numpy.argsort(probs)[::-1][:TRUNCATE_COUNT]
    is_accurate = lambda p: sample_count * p * (1 - p) >= 1
    highest_by_prob = [i for i in highest_by_prob if is_accurate(probs[i])]
    highest_by_count = numpy.argsort(counts)[::-1][:TRUNCATE_COUNT]
    highest = list(set(highest_by_prob) | set(highest_by_count))
    truncated = len(highest_by_prob) < len(probs)
    if len(highest_by_prob) < 1:
        LOG('Warn', casename, 'test is inaccurate; use more samples')
        return None

    goodness_of_fit = multinomial_goodness_of_fit(
        probs[highest_by_prob],
        counts[highest_by_prob],
        total_count=sample_count,
        truncated=truncated)
    comment = 'goodness of fit = {:0.3g}'.format(goodness_of_fit)
    if goodness_of_fit > MIN_GOODNESS_OF_FIT:
        LOG('Pass', casename, comment)
        return None
    else:
        # On failure, print per-latent chi residuals (count - expect) / sqrt(expect).
        print 'EXPECT\tACTUAL\tCHI\tVALUE'
        lines = [(probs[i], counts[i], latents[i]) for i in highest]
        for prob, count, latent in sorted(lines, reverse=True):
            expect = prob * sample_count
            chi = (count - expect) * expect ** -0.5
            pretty = pretty_latent(latent)
            print '{:0.1f}\t{}\t{:+0.1f}\t{}'.format(
                expect, count, chi, pretty)
        return LOG('Fail', casename, comment)
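
# For reference: scores_to_probs converts log-space scores into a normalized
# probability vector. This is a minimal sketch of the assumed behavior
# (subtract the max for numerical stability, exponentiate, normalize); it is
# not claimed to be the library's exact implementation.
def _scores_to_probs_sketch(scores):
    scores = numpy.array(scores, dtype=float)
    scores = scores - scores.max()  # guard exp() against overflow
    probs = numpy.exp(scores)
    probs /= probs.sum()  # normalize so probs sums to 1
    return probs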