Beispiel #1
0
def test_score_derivative_runs(root, rows, **unused):
    """Smoke-test ``score_derivative`` with an explicit list of rows to score.

    The first loaded row is used as the target and the first two loaded rows
    are scored against it; one result is expected per scored row.
    """
    with loom.query.get_server(root, debug=True) as server:
        loaded = load_rows(rows)
        target = protobuf_to_data_row(loaded[0].diff)
        candidates = []
        for message in loaded[:2]:
            candidates.append(protobuf_to_data_row(message.diff))
        scored = server.score_derivative(target, candidates)
        assert len(scored) == len(candidates)
Beispiel #2
0
def test_entropy(root, ingest, **unused):
    """Compare the server's entropy estimate with one derived from samples.

    For a fully unobserved row plus the first four ingested rows, the
    unobserved features are sampled, the samples are scored, and the estimate
    built from ``base_score - scores`` is checked against ``server.entropy``.
    """
    sample_count = 1000
    with loom.query.get_server(root) as server:
        data_rows = [
            loom.query.protobuf_to_data_row(message.diff)
            for message in load_rows(ingest['rows'])[:4]
        ]
        # Lead with an all-None row so that every feature is sampled once.
        blank_row = [None] * len(data_rows[0])
        for row in [blank_row] + data_rows:
            to_sample = [value is None for value in row]
            samples = server.sample(
                to_sample=to_sample,
                conditioning_row=row,
                sample_count=sample_count)
            base_score = server.score(row)
            sample_scores = numpy.array(list(server.batch_score(samples)))
            py_estimate = loom.query.get_estimate(base_score - sample_scores)

            # Indices of the features that were sampled for this row.
            feature_set = frozenset(
                index for index, flag in enumerate(to_sample) if flag)
            entropies = server.entropy(
                row_sets=[feature_set],
                col_sets=[feature_set],
                conditioning_row=row,
                sample_count=sample_count)
            cpp_estimate = entropies[feature_set]

            assert_estimate_close(cpp_estimate, py_estimate)
Beispiel #3
0
def test_samples_match_scores(root, rows, **unused):
    """Check that marginal samples match scores on a subsample of rows.

    Roughly every ``len(rows) // 5``-th row is checked.  A config holding a
    fixed ``SEED`` is dumped to ``config.pb.gz`` inside a temporary directory
    and handed to the query server.
    """
    rows = load_rows(rows)
    # Floor division keeps the slice step an integer, and clamping to 1
    # avoids "slice step cannot be zero" when fewer than five rows exist.
    step = max(1, len(rows) // 5)
    rows = rows[::step]
    with tempdir():
        loom.config.config_dump({'seed': SEED}, 'config.pb.gz')
        with loom.query.get_server(root, 'config.pb.gz', debug=True) as server:
            for row in rows:
                _check_marginal_samples_match_scores(server, row, 0)
Beispiel #4
0
def test_samples_match_scores(root, rows, **unused):
    """Check that marginal samples match scores on a subsample of rows.

    Roughly every ``len(rows) // 5``-th row is checked.  A config holding a
    fixed ``SEED`` is dumped to ``config.pb.gz`` inside a temporary directory
    and handed to the query server.
    """
    rows = load_rows(rows)
    # Floor division keeps the slice step an integer, and clamping to 1
    # avoids "slice step cannot be zero" when fewer than five rows exist.
    step = max(1, len(rows) // 5)
    rows = rows[::step]
    with tempdir():
        loom.config.config_dump({'seed': SEED}, 'config.pb.gz')
        with loom.query.get_server(root, 'config.pb.gz', debug=True) as server:
            for row in rows:
                _check_marginal_samples_match_scores(server, row, 0)
Beispiel #5
0
def test_one_to_one(rows, **unused):
    """Verify that shuffling reorders rows without adding or dropping any.

    The shuffled output must exist, have the same length as the input,
    differ from it in order, and be equal to it once both are sorted by id.
    """
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        rows_out = os.path.abspath('rows_out.pbs.gz')
        loom.runner.shuffle(rows_in=rows, rows_out=rows_out, seed=12345)
        assert_found(rows_out)

        original = load_rows(rows)
        shuffled = load_rows(rows_out)
        assert_equal(len(shuffled), len(original))
        assert_not_equal(shuffled, original)

        def row_id(row):
            return row.id

        # Sorting by id removes the ordering difference before comparing.
        assert_list_equal(
            sorted(original, key=row_id),
            sorted(shuffled, key=row_id))
Beispiel #6
0
def test_score_derivative_against_existing_runs(root, rows, **unused):
    """Smoke-test ``score_derivative`` when no explicit score rows are given.

    With ``score_rows=None`` one result per loaded row is expected, and
    ``row_limit=1`` must cut the result down to a single entry.
    """
    with loom.query.get_server(root, debug=True) as server:
        loaded = load_rows(rows)
        target = protobuf_to_data_row(loaded[0].diff)

        unlimited = server.score_derivative(target, score_rows=None)
        assert len(loaded) == len(unlimited)

        limited = server.score_derivative(
            target, score_rows=None, row_limit=1)
        assert len(limited) == 1
Beispiel #7
0
def test_export_rows(encoding, rows, **unused):
    """Round-trip rows through CSV export and import and compare the data.

    Rows are exported in chunks of 51, re-imported to a new file, and the
    ``diff`` payloads of both row sets are compared after sorting by id.
    """
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        csv_out = os.path.abspath('rows_csv')
        pbs_out = os.path.abspath('rows.pbs.gz')

        loom.format.export_rows(
            encoding_in=encoding,
            rows_in=rows,
            rows_csv_out=csv_out,
            chunk_size=51)
        assert_found(csv_out)
        first_chunk = os.path.join(csv_out, 'rows.0.csv.gz')
        assert_found(first_chunk)

        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=csv_out,
            rows_out=pbs_out)
        assert_found(pbs_out)

        expected = load_rows(rows)
        actual = load_rows(pbs_out)
        assert_equal(len(actual), len(expected))

        def row_id(row):
            return row.id

        # Row order is not compared, so align both sides by id first.
        actual.sort(key=row_id)
        expected.sort(key=row_id)
        expected_data = [row.diff for row in expected]
        actual_data = [row.diff for row in actual]
        assert_close(actual_data, expected_data)
Beispiel #8
0
def test_score_derivative_against_existing_runs(root, rows, **unused):
    """Smoke-test ``score_derivative`` when no explicit score rows are given.

    With ``score_rows=None`` one result per loaded row is expected, and
    ``row_limit=1`` must cut the result down to a single entry.
    """
    with loom.query.get_server(root, debug=True) as server:
        loaded = load_rows(rows)
        target = protobuf_to_data_row(loaded[0].diff)

        unlimited = server.score_derivative(target, score_rows=None)
        assert len(loaded) == len(unlimited)

        limited = server.score_derivative(
            target, score_rows=None, row_limit=1)
        assert len(limited) == 1
Beispiel #9
0
def test_export_rows(encoding, rows, **unused):
    """Round-trip rows through CSV export and import and compare the data.

    Rows are exported in chunks of 51, re-imported to a new file, and the
    ``diff`` payloads of both row sets are compared after sorting by id.
    """
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        csv_out = os.path.abspath('rows_csv')
        pbs_out = os.path.abspath('rows.pbs.gz')

        loom.format.export_rows(
            encoding_in=encoding,
            rows_in=rows,
            rows_csv_out=csv_out,
            chunk_size=51)
        assert_found(csv_out)
        first_chunk = os.path.join(csv_out, 'rows.0.csv.gz')
        assert_found(first_chunk)

        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=csv_out,
            rows_out=pbs_out)
        assert_found(pbs_out)

        expected = load_rows(rows)
        actual = load_rows(pbs_out)
        assert_equal(len(actual), len(expected))

        def row_id(row):
            return row.id

        # Row order is not compared, so align both sides by id first.
        actual.sort(key=row_id)
        expected.sort(key=row_id)
        expected_data = [row.diff for row in expected]
        actual_data = [row.diff for row in actual]
        assert_close(actual_data, expected_data)
Beispiel #10
0
def _test_entropy(name, sample_count):
    """Compare the server's entropy estimate with one derived from samples.

    Paths are looked up from ``name`` via ``loom.store.get_paths``.  For a
    fully unobserved row plus the first four ingested rows, the unobserved
    features are sampled and scored, and the estimate built from
    ``base_score - scores`` is checked against ``server.entropy``.
    """
    paths = loom.store.get_paths(name)
    with loom.query.get_server(paths['root']) as server:
        data_rows = [
            loom.query.protobuf_to_data_row(message.diff)
            for message in load_rows(paths['ingest']['rows'])[:4]
        ]
        # Lead with an all-None row so that every feature is sampled once.
        blank_row = [None] * len(data_rows[0])
        for row in [blank_row] + data_rows:
            to_sample = [value is None for value in row]
            samples = server.sample(
                to_sample=to_sample,
                conditioning_row=row,
                sample_count=sample_count)
            base_score = server.score(row)
            sample_scores = numpy.array(list(server.batch_score(samples)))
            py_estimate = loom.query.get_estimate(base_score - sample_scores)

            # Indices of the features that were sampled for this row.
            feature_set = frozenset(
                index for index, flag in enumerate(to_sample) if flag)
            entropies = server.entropy(
                row_sets=[feature_set],
                col_sets=[feature_set],
                conditioning_row=row,
                sample_count=sample_count)
            cpp_estimate = entropies[feature_set]

            assert_estimate_close(cpp_estimate, py_estimate)
Beispiel #11
0
def get_example_requests(model, rows, query_type='mixed'):
    """Build a list of example ``Query.Request`` messages.

    Two groups of requests are produced: first one per synthetic observation
    mask derived from the model's features, then one for each of the first 20
    rows loaded from ``rows``.  ``query_type`` must be 'sample', 'score' or
    'mixed' and selects which parts of each request are populated.
    """
    assert query_type in ['sample', 'score', 'mixed']
    want_sample = query_type in ['sample', 'mixed']
    want_score = query_type in ['score', 'mixed']

    cross_cat = CrossCat()
    with open_compressed(model, 'rb') as model_file:
        cross_cat.ParseFromString(model_file.read())
    feature_count = sum(len(kind.featureids) for kind in cross_cat.kinds)
    featureids = range(feature_count)

    # A feature is marked trivial when its 'dd' shared model has no alphas
    # or its 'dpd' shared model has no betas.  Feature ids are consumed in
    # the order the shared models are visited.
    nontrivials = [True] * feature_count
    for kind in cross_cat.kinds:
        remaining_ids = iter(kind.featureids)
        for model_name in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model_name):
                featureid = next(remaining_ids)
                if model_name == 'dd':
                    trivial = len(shared.alphas) == 0
                elif model_name == 'dpd':
                    trivial = len(shared.betas) == 0
                else:
                    trivial = False
                if trivial:
                    nontrivials[featureid] = False
    all_observed = nontrivials[:]
    none_observed = [False] * feature_count

    observeds = [all_observed]
    # Everything nontrivial, minus one feature at a time.
    for featureid, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            mask = all_observed[:]
            mask[featureid] = False
            observeds.append(mask)
    # One random mask per feature; trivial features always stay False.
    for _ in featureids:
        observeds.append([
            nontrivial and sample_bernoulli(0.5)
            for nontrivial in nontrivials
        ])
    # Nothing, plus one nontrivial feature at a time.
    for featureid, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            mask = none_observed[:]
            mask[featureid] = True
            observeds.append(mask)
    observeds.append(none_observed)

    requests = []
    for index, observed in enumerate(observeds):
        request = Query.Request()
        request.id = "example-{}".format(index)
        if want_sample:
            set_diff(request.sample.data, none_observed)
            request.sample.to_sample.sparsity = DENSE
            request.sample.to_sample.dense[:] = observed
            request.sample.sample_count = 1
        if want_score:
            set_diff(request.score.data, none_observed)
        requests.append(request)

    # Row-based requests continue the id numbering after the mask requests.
    for index, row in enumerate(load_rows(rows)[:20], len(observeds)):
        request = Query.Request()
        request.id = "example-{}".format(index)
        if want_sample:
            request.sample.sample_count = 1
            request.sample.data.MergeFrom(row.diff)
            request.sample.to_sample.sparsity = DENSE
            # Sample exactly the nontrivial features the row leaves unset.
            to_sample = [
                nontrivial and not is_observed
                for nontrivial, is_observed
                in izip(nontrivials, row.diff.pos.observed.dense)
            ]
            set_observed(request.sample.to_sample, to_sample)
        if want_score:
            request.score.data.MergeFrom(row.diff)
        requests.append(request)
    return requests
Beispiel #12
0
def get_example_requests(model, rows, query_type='mixed'):
    """Build a list of example ``Query.Request`` messages.

    Two groups of requests are produced: first one per synthetic observation
    mask derived from the model's features, then one for each of the first 20
    rows loaded from ``rows``.  ``query_type`` must be 'sample', 'score' or
    'mixed' and selects which parts of each request are populated.
    """
    assert query_type in ['sample', 'score', 'mixed']
    want_sample = query_type in ['sample', 'mixed']
    want_score = query_type in ['score', 'mixed']

    cross_cat = CrossCat()
    with open_compressed(model, 'rb') as model_file:
        cross_cat.ParseFromString(model_file.read())
    feature_count = sum(len(kind.featureids) for kind in cross_cat.kinds)
    featureids = range(feature_count)

    # A feature is marked trivial when its 'dd' shared model has no alphas
    # or its 'dpd' shared model has no betas.  Feature ids are consumed in
    # the order the shared models are visited.
    nontrivials = [True] * feature_count
    for kind in cross_cat.kinds:
        remaining_ids = iter(kind.featureids)
        for model_name in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model_name):
                featureid = next(remaining_ids)
                if model_name == 'dd':
                    trivial = len(shared.alphas) == 0
                elif model_name == 'dpd':
                    trivial = len(shared.betas) == 0
                else:
                    trivial = False
                if trivial:
                    nontrivials[featureid] = False
    all_observed = nontrivials[:]
    none_observed = [False] * feature_count

    observeds = [all_observed]
    # Everything nontrivial, minus one feature at a time.
    for featureid, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            mask = all_observed[:]
            mask[featureid] = False
            observeds.append(mask)
    # One random mask per feature; trivial features always stay False.
    for _ in featureids:
        observeds.append([
            nontrivial and sample_bernoulli(0.5)
            for nontrivial in nontrivials
        ])
    # Nothing, plus one nontrivial feature at a time.
    for featureid, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            mask = none_observed[:]
            mask[featureid] = True
            observeds.append(mask)
    observeds.append(none_observed)

    requests = []
    for index, observed in enumerate(observeds):
        request = Query.Request()
        request.id = "example-{}".format(index)
        if want_sample:
            set_diff(request.sample.data, none_observed)
            request.sample.to_sample.sparsity = DENSE
            request.sample.to_sample.dense[:] = observed
            request.sample.sample_count = 1
        if want_score:
            set_diff(request.score.data, none_observed)
        requests.append(request)

    # Row-based requests continue the id numbering after the mask requests.
    for index, row in enumerate(load_rows(rows)[:20], len(observeds)):
        request = Query.Request()
        request.id = "example-{}".format(index)
        if want_sample:
            request.sample.sample_count = 1
            request.sample.data.MergeFrom(row.diff)
            request.sample.to_sample.sparsity = DENSE
            # Sample exactly the nontrivial features the row leaves unset.
            to_sample = [
                nontrivial and not is_observed
                for nontrivial, is_observed
                in izip(nontrivials, row.diff.pos.observed.dense)
            ]
            set_observed(request.sample.to_sample, to_sample)
        if want_score:
            request.score.data.MergeFrom(row.diff)
        requests.append(request)
    return requests
Beispiel #13
0
def test_samples_match_scores(root, rows, **unused):
    """Check that marginal samples match scores on a subsample of rows.

    Roughly every ``len(rows) // 5``-th row is checked against a query
    server started in debug mode.
    """
    rows = load_rows(rows)
    # Floor division keeps the slice step an integer, and clamping to 1
    # avoids "slice step cannot be zero" when fewer than five rows exist.
    step = max(1, len(rows) // 5)
    rows = rows[::step]
    with loom.query.get_server(root, debug=True) as server:
        for row in rows:
            _check_marginal_samples_match_scores(server, row, 0)