def test_score_derivative_runs(root, rows, **unused):
    """Smoke test: score_derivative yields one result per supplied score row."""
    with loom.query.get_server(root, debug=True) as server:
        loaded = load_rows(rows)
        target = protobuf_to_data_row(loaded[0].diff)
        candidates = []
        for raw in loaded[:2]:
            candidates.append(protobuf_to_data_row(raw.diff))
        derivatives = server.score_derivative(target, candidates)
        assert len(derivatives) == len(candidates)
def test_entropy(root, ingest, **unused):
    """Check that the server's C++ entropy estimate agrees with a Monte Carlo
    estimate computed in Python from sample/score round trips."""
    sample_count = 1000
    with loom.query.get_server(root) as server:
        raw_rows = load_rows(ingest['rows'])[:4]
        data_rows = [loom.query.protobuf_to_data_row(r.diff) for r in raw_rows]
        # Prepend an all-None row so the fully-unobserved case is covered too.
        data_rows = [[None] * len(data_rows[0])] + data_rows
        for data_row in data_rows:
            mask = [value is None for value in data_row]
            drawn = server.sample(
                conditioning_row=data_row,
                to_sample=mask,
                sample_count=sample_count)
            baseline = server.score(data_row)
            sample_scores = numpy.array(list(server.batch_score(drawn)))
            py_estimate = loom.query.get_estimate(baseline - sample_scores)
            features = frozenset(
                pos for pos, flagged in enumerate(mask) if flagged)
            cpp_estimate = server.entropy(
                row_sets=[features],
                col_sets=[features],
                conditioning_row=data_row,
                sample_count=sample_count)[features]
            assert_estimate_close(cpp_estimate, py_estimate)
def test_samples_match_scores(root, rows, **unused):
    """Check marginal sample/score agreement on a deterministic-seed server.

    NOTE(review): another def with this exact name appears later in this
    module and shadows this one at import time — confirm which is intended.
    """
    subset = load_rows(rows)
    subset = subset[::len(subset) / 5]
    with tempdir():
        loom.config.config_dump({'seed': SEED}, 'config.pb.gz')
        server_ctx = loom.query.get_server(root, 'config.pb.gz', debug=True)
        with server_ctx as server:
            for sample_row in subset:
                _check_marginal_samples_match_scores(server, sample_row, 0)
def test_one_to_one(rows, **unused):
    """Shuffling must permute the rows: same multiset, different order."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        shuffled_path = os.path.abspath('rows_out.pbs.gz')
        loom.runner.shuffle(
            rows_in=rows,
            rows_out=shuffled_path,
            seed=12345)
        assert_found(shuffled_path)
        before = load_rows(rows)
        after = load_rows(shuffled_path)
        assert_equal(len(after), len(before))
        # A real shuffle should not leave the sequence untouched.
        assert_not_equal(after, before)
        by_id = lambda r: r.id
        assert_list_equal(sorted(before, key=by_id), sorted(after, key=by_id))
def test_score_derivative_against_existing_runs(root, rows, **unused):
    """score_rows=None scores against all stored rows; row_limit truncates.

    NOTE(review): an identical def with this name appears again later in this
    module and shadows this one — likely an accidental duplicate.
    """
    with loom.query.get_server(root, debug=True) as server:
        stored = load_rows(rows)
        query_row = protobuf_to_data_row(stored[0].diff)
        full = server.score_derivative(query_row, score_rows=None)
        assert len(full) == len(stored)
        limited = server.score_derivative(
            query_row, score_rows=None, row_limit=1)
        assert len(limited) == 1
def test_export_rows(encoding, rows, **unused):
    """Export to CSV and re-import; the round trip must preserve all rows.

    NOTE(review): an identical def with this name appears again later in this
    module and shadows this one — likely an accidental duplicate.
    """
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        csv_dir = os.path.abspath('rows_csv')
        pbs_path = os.path.abspath('rows.pbs.gz')
        loom.format.export_rows(
            encoding_in=encoding,
            rows_in=rows,
            rows_csv_out=csv_dir,
            chunk_size=51)
        assert_found(csv_dir)
        assert_found(os.path.join(csv_dir, 'rows.0.csv.gz'))
        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=csv_dir,
            rows_out=pbs_path)
        assert_found(pbs_path)
        original = load_rows(rows)
        round_tripped = load_rows(pbs_path)
        assert_equal(len(round_tripped), len(original))
        # Export is not order-preserving, so compare sorted by row id.
        round_tripped.sort(key=lambda r: r.id)
        original.sort(key=lambda r: r.id)
        assert_close(
            [r.diff for r in round_tripped],
            [r.diff for r in original])
def test_score_derivative_against_existing_runs(root, rows, **unused):
    """With score_rows=None the server scores against every stored row,
    and row_limit caps the number of results returned."""
    with loom.query.get_server(root, debug=True) as server:
        all_rows = load_rows(rows)
        first = protobuf_to_data_row(all_rows[0].diff)
        unbounded = server.score_derivative(first, score_rows=None)
        assert len(all_rows) == len(unbounded)
        capped = server.score_derivative(first, score_rows=None, row_limit=1)
        assert len(capped) == 1
def test_export_rows(encoding, rows, **unused):
    """Round-trip rows through CSV export/import and verify nothing is lost."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        exported_dir = os.path.abspath('rows_csv')
        reimported = os.path.abspath('rows.pbs.gz')
        loom.format.export_rows(
            encoding_in=encoding,
            rows_in=rows,
            rows_csv_out=exported_dir,
            chunk_size=51)
        assert_found(exported_dir)
        assert_found(os.path.join(exported_dir, 'rows.0.csv.gz'))
        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=exported_dir,
            rows_out=reimported)
        assert_found(reimported)
        want = load_rows(rows)
        got = load_rows(reimported)
        assert_equal(len(got), len(want))
        # Order is not preserved by the round trip; align by row id first.
        key = lambda r: r.id
        got.sort(key=key)
        want.sort(key=key)
        want_diffs = [r.diff for r in want]
        got_diffs = [r.diff for r in got]
        assert_close(got_diffs, want_diffs)
def _test_entropy(name, sample_count):
    """Compare the server's entropy estimate against a Python-side
    Monte Carlo estimate, for the dataset stored under *name*."""
    paths = loom.store.get_paths(name)
    with loom.query.get_server(paths['root']) as server:
        raw = load_rows(paths['ingest']['rows'])[:4]
        converted = [loom.query.protobuf_to_data_row(r.diff) for r in raw]
        # Include a fully-unobserved row as an extra case.
        converted = [[None] * len(converted[0])] + converted
        for current in converted:
            unknown = [cell is None for cell in current]
            drawn = server.sample(
                conditioning_row=current,
                to_sample=unknown,
                sample_count=sample_count)
            anchor = server.score(current)
            drawn_scores = numpy.array(list(server.batch_score(drawn)))
            py_estimate = loom.query.get_estimate(anchor - drawn_scores)
            fset = frozenset(
                idx for idx, flag in enumerate(unknown) if flag)
            cpp_estimate = server.entropy(
                row_sets=[fset],
                col_sets=[fset],
                conditioning_row=current,
                sample_count=sample_count)[fset]
            assert_estimate_close(cpp_estimate, py_estimate)
def get_example_requests(model, rows, query_type='mixed'):
    """Build a list of example Query.Request protos for exercising a server.

    Generates requests over a systematic family of observation masks
    (all-observed, leave-one-out, random Bernoulli masks, single-feature,
    none-observed), then ~20 more requests derived from actual stored rows.

    model -- path to a compressed CrossCat model file
    rows -- path to stored rows (read via load_rows)
    query_type -- one of 'sample', 'score', 'mixed'; controls which request
        sections are populated
    """
    assert query_type in ['sample', 'score', 'mixed']
    cross_cat = CrossCat()
    with open_compressed(model, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    feature_count = sum(len(kind.featureids) for kind in cross_cat.kinds)
    featureids = range(feature_count)
    # nontrivials[f] is False for features whose shared model has no
    # hyperparameters (empty alphas/betas) — those are skipped below.
    nontrivials = [True] * feature_count
    for kind in cross_cat.kinds:
        fs = iter(kind.featureids)
        # NOTE(review): this loop variable shadows the `model` parameter;
        # harmless here since the parameter is not used past this point,
        # but worth renaming.
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                f = fs.next()
                if model == 'dd':
                    if len(shared.alphas) == 0:
                        nontrivials[f] = False
                elif model == 'dpd':
                    if len(shared.betas) == 0:
                        nontrivials[f] = False
    all_observed = nontrivials[:]
    none_observed = [False] * feature_count
    observeds = []
    observeds.append(all_observed)
    # Leave-one-out masks: everything observed except one nontrivial feature.
    for f, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            observed = all_observed[:]
            observed[f] = False
            observeds.append(observed)
    # Random masks, one per feature position (f itself is unused; the loop
    # just controls how many random masks are generated).
    for f in featureids:
        observed = [
            nontrivial and sample_bernoulli(0.5)
            for nontrivial in nontrivials
        ]
        observeds.append(observed)
    # Single-feature masks: only one nontrivial feature observed.
    for f, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            observed = none_observed[:]
            observed[f] = True
            observeds.append(observed)
    observeds.append(none_observed)
    requests = []
    for i, observed in enumerate(observeds):
        request = Query.Request()
        request.id = "example-{}".format(i)
        if query_type in ['sample', 'mixed']:
            set_diff(request.sample.data, none_observed)
            request.sample.to_sample.sparsity = DENSE
            request.sample.to_sample.dense[:] = observed
            request.sample.sample_count = 1
        if query_type in ['score', 'mixed']:
            set_diff(request.score.data, none_observed)
        requests.append(request)
    # Additional requests built from up to 20 real stored rows.
    # NOTE(review): `i` here deliberately continues from the enumerate()
    # loop above so request ids stay unique — fragile but intentional.
    for row in load_rows(rows)[:20]:
        i += 1
        request = Query.Request()
        request.id = "example-{}".format(i)
        if query_type in ['sample', 'mixed']:
            request.sample.sample_count = 1
            request.sample.data.MergeFrom(row.diff)
            request.sample.to_sample.sparsity = DENSE
            # Sample exactly the nontrivial features the row did not observe.
            conditions = izip(nontrivials, row.diff.pos.observed.dense)
            to_sample = [
                nontrivial and not is_observed
                for nontrivial, is_observed in conditions
            ]
            set_observed(request.sample.to_sample, to_sample)
        if query_type in ['score', 'mixed']:
            request.score.data.MergeFrom(row.diff)
        requests.append(request)
    return requests
def test_samples_match_scores(root, rows, **unused):
    """Verify marginal samples agree with scores on a spread of stored rows."""
    stored = load_rows(rows)
    # Take ~5 evenly spaced rows (integer step).
    step = len(stored) / 5
    picked = stored[::step]
    with loom.query.get_server(root, debug=True) as server:
        for candidate in picked:
            _check_marginal_samples_match_scores(server, candidate, 0)