Beispiel #1
0
def test_infer(name, tares, shuffled, init, **unused):
    """Run loom.runner.infer under every config in CONFIGS and sanity-check outputs.

    name: test label (unused here).
    tares: tare-rows file path.
    shuffled: shuffled row stream path.
    init: initial CrossCat model file path.
    """
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        # Count input rows up front so assignments can be checked later.
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)

        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)

            # NOTE(review): with zero extra passes or zero kind-kernel
            # iterations, inference presumably cannot change the number of
            # kinds — confirm against the kind kernel's semantics.
            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0

            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True,
                )

                # When kind structure cannot change, expect one groups file
                # per kind of the initial model.
                if kind_structure_is_fixed:
                    assert_equal(len(os.listdir(groups_out)), kind_count)

                group_counts = get_group_counts(groups_out)

                # Every input row must receive exactly one assignment.
                assign_count = sum(1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)

            print 'row_count: {}'.format(row_count)
            print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
            # NOTE(review): the message suggests the intent may have been a
            # strict `<` (guarding against all-singleton groupings); as
            # written, `<=` can never fail — verify.
            for group_count in group_counts:
                assert_true(group_count <= row_count,
                            'groups are all singletons')
Beispiel #2
0
def test_infer(name, tares, shuffled, init, **unused):
    """Run loom.runner.infer under every config in CONFIGS and sanity-check outputs.

    name: test label (unused here).
    tares: tare-rows file path.
    shuffled: shuffled row stream path.
    init: initial CrossCat model file path.
    """
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        # Count input rows up front so assignments can be checked later.
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)

        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)

            # NOTE(review): with zero extra passes or zero kind-kernel
            # iterations, inference presumably cannot change the number of
            # kinds — confirm against the kind kernel's semantics.
            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0

            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True,)

                # When kind structure cannot change, expect one groups file
                # per kind of the initial model.
                if kind_structure_is_fixed:
                    assert_equal(len(os.listdir(groups_out)), kind_count)

                group_counts = get_group_counts(groups_out)

                # Every input row must receive exactly one assignment.
                assign_count = sum(1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)

            print 'row_count: {}'.format(row_count)
            print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
            # NOTE(review): the message suggests the intent may have been a
            # strict `<` (guarding against all-singleton groupings); as
            # written, `<=` can never fail — verify.
            for group_count in group_counts:
                assert_true(
                    group_count <= row_count,
                    'groups are all singletons')
Beispiel #3
0
def test_generate(model, **unused):
    for row_count in [0, 1, 100]:
        for density in [0.0, 0.5, 1.0]:
            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                config = {
                    'generate': {
                        'row_count': row_count,
                        'density': density,
                    },
                }
                loom.config.config_dump(config, config_in)
                assert_found(config_in)

                rows_out = os.path.abspath('rows.pbs.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                loom.runner.generate(config_in=config_in,
                                     model_in=model,
                                     rows_out=rows_out,
                                     model_out=model_out,
                                     groups_out=groups_out,
                                     debug=True)
                assert_found(rows_out, model_out, groups_out)

                group_counts = get_group_counts(groups_out)
                print 'group_counts: {}'.format(' '.join(map(
                    str, group_counts)))
Beispiel #4
0
def test_posterior_enum(name, tares, diffs, init, **unused):
    """posterior_enum must emit exactly the configured number of samples."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        sample_count = 7
        config = {
            'posterior_enum': {'sample_count': sample_count},
            'kernels': {
                'kind': {'row_queue_capacity': 0, 'score_parallel': False},
            },
        }
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)
        assert_found(config_in)

        samples_out = os.path.abspath('samples.pbs.gz')
        loom.runner.posterior_enum(
            config_in=config_in,
            model_in=init,
            tares_in=tares,
            rows_in=diffs,
            samples_out=samples_out,
            debug=True)
        assert_found(samples_out)
        actual_count = sum(1 for _ in protobuf_stream_load(samples_out))
        assert_equal(actual_count, sample_count)
Beispiel #5
0
def test_generate(init, **unused):
    for row_count in [0, 1, 100]:
        for density in [0.0, 0.5, 1.0]:
            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                config = {
                    'generate': {
                        'row_count': row_count,
                        'density': density,
                    },
                }
                loom.config.config_dump(config, config_in)
                assert_true(os.path.exists(config_in))

                rows_out = os.path.abspath('rows.pbs.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                loom.runner.generate(
                    config_in=config_in,
                    model_in=init,
                    rows_out=rows_out,
                    model_out=model_out,
                    groups_out=groups_out,
                    debug=True)
                assert_true(os.path.exists(rows_out))
                assert_true(os.path.exists(model_out))
                assert_true(os.path.exists(groups_out))

                group_counts = get_group_counts(groups_out)
                print 'group_counts: {}'.format(
                    ' '.join(map(str, group_counts)))
Beispiel #6
0
def test_generate_init(encoding, **unused):
    """generate_init must write an initial model file from an encoding."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        model_out = os.path.abspath('init.pb.gz')
        loom.generate.generate_init(encoding_in=encoding, model_out=model_out)
        assert_found(model_out)
Beispiel #7
0
def generate(
        feature_type='mixed',
        rows=10000,
        cols=100,
        density=0.5,
        debug=False,
        profile='time'):
    '''
    Generate a synthetic dataset.

    feature_type: feature family passed through to loom.generate.generate.
    rows, cols: dataset dimensions (row count and feature count).
    density: observation density passed through (presumably the fraction of
        observed cells — confirm against loom.generate.generate).
    debug: keep the scratch directory on error and forward to the generator.
    profile: profiling mode forwarded to the generator.
    '''
    # Remember the caller's cwd before entering the scratch directory.
    root = os.path.abspath(os.path.curdir)
    with tempdir(cleanup_on_error=(not debug)):
        init_out = os.path.abspath('init.pb.gz')
        rows_out = os.path.abspath('rows.pbs.gz')
        model_out = os.path.abspath('model.pb.gz')
        groups_out = os.path.abspath('groups')

        # NOTE(review): tempdir appears to chdir into the scratch dir (the
        # abspaths above are resolved inside it); hop back to the original
        # cwd before running — confirm against tempdir's implementation.
        os.chdir(root)
        loom.generate.generate(
            row_count=rows,
            feature_count=cols,
            feature_type=feature_type,
            density=density,
            init_out=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            debug=debug,
            profile=profile)

        print 'model file is {} bytes'.format(os.path.getsize(model_out))
        print 'rows file is {} bytes'.format(os.path.getsize(rows_out))
Beispiel #8
0
def test_make_schema(model, **unused):
    """make_schema must derive and write a schema file from the model."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        schema_out = os.path.abspath('schema.json.gz')
        loom.format.make_schema(model_in=model, schema_out=schema_out)
        assert_found(schema_out)
Beispiel #9
0
def test_tare(rows, schema_row, **unused):
    """tare must write a tare-rows stream for the given schema row."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        tares_out = os.path.abspath('tares.pbs.gz')
        loom.runner.tare(
            schema_row_in=schema_row,
            rows_in=rows,
            tares_out=tares_out)
        assert_found(tares_out)
Beispiel #10
0
def test_make_fake_encoding(schema, model, **unused):
    """make_fake_encoding must write an encoding for schema + model."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        encoding_out = os.path.abspath('encoding.json.gz')
        loom.format.make_fake_encoding(
            schema_in=schema, model_in=model, encoding_out=encoding_out)
        assert_found(encoding_out)
Beispiel #11
0
def batch_predict(
        config_in,
        model_in,
        groups_in,
        requests,
        debug=False,
        profile=None):
    """Run a batch of protobuf query requests and return parsed responses.

    config_in, model_in, groups_in: file paths handed straight to
        loom.runner.query.
    requests: iterable of protobuf messages (each must support
        SerializeToString).
    Returns a list of parse_response(...) results, one per response in the
    runner's output stream.
    """
    # Remember the caller's cwd before entering the scratch directory.
    root = os.path.abspath(os.path.curdir)
    with tempdir(cleanup_on_error=(not debug)):
        requests_in = os.path.abspath('requests.pbs.gz')
        responses_out = os.path.abspath('responses.pbs.gz')
        # Serialize the requests into a protobuf stream for the runner.
        protobuf_stream_dump(
            (q.SerializeToString() for q in requests),
            requests_in)

        # NOTE(review): tempdir appears to change cwd; restore it before
        # invoking the runner — confirm against tempdir's implementation.
        os.chdir(root)
        loom.runner.query(
            config_in=config_in,
            model_in=model_in,
            groups_in=groups_in,
            requests_in=requests_in,
            responses_out=responses_out,
            debug=debug,
            profile=profile)

        # Parse responses before leaving the scratch dir (the stream file
        # lives inside it and is removed on exit).
        return map(parse_response, protobuf_stream_load(responses_out))
Beispiel #12
0
def test_generate_init(encoding, **unused):
    """generate_init must write an initial model file from an encoding."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        model_out = os.path.abspath('init.pb.gz')
        loom.generate.generate_init(encoding_in=encoding, model_out=model_out)
        assert_found(model_out)
Beispiel #13
0
def _import_dir(import_file, args):
    """Import every file in a directory of row CSVs, then concatenate parts.

    import_file: worker function handed to loom.util.parallel_map.
    args: (rows_csv_in, file_out, id_offset, id_stride, misc) tuple.

    Each input file is imported in parallel into its own part file; the
    parts are then concatenated, in sorted-filename order, into file_out.
    """
    rows_csv_in, file_out, id_offset, id_stride, misc = args
    assert os.path.isdir(rows_csv_in)
    parts_in = sorted(
        os.path.abspath(os.path.join(rows_csv_in, f))
        for f in os.listdir(rows_csv_in)
    )
    part_count = len(parts_in)
    assert part_count > 0, 'no files in {}'.format(rows_csv_in)
    parts_out = []
    tasks = []
    for i, part_in in enumerate(parts_in):
        part_out = 'part.{}.{}'.format(i, os.path.basename(file_out))
        # Stagger ids across parts so no two parts produce the same id:
        # part i presumably emits offset, offset + stride, ... — confirm
        # against the import_file worker's id handling.
        offset = id_offset + id_stride * i
        stride = id_stride * part_count
        parts_out.append(part_out)
        tasks.append((part_in, part_out, offset, stride, misc))
    with tempdir():
        loom.util.parallel_map(import_file, tasks)
        # It is safe use open instead of open_compressed even for .gz files;
        # see http://stackoverflow.com/questions/8005114
        with open(file_out, 'wb') as whole:
            for part_out in parts_out:
                with open(part_out, 'rb') as part:
                    shutil.copyfileobj(part, whole)
                os.remove(part_out)
Beispiel #14
0
def test_predict(root, rows_csv, encoding, **unused):
    """predict must echo observed cells and fill in unobserved ones.

    For each input row, preql.predict emits COUNT prediction rows; observed
    cells must round-trip through the column encoder, unobserved cells must
    come back non-empty.
    """
    COUNT = 10
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.query.get_server(root, debug=True) as query_server:
            result_out = 'predictions_out.csv'
            # Predict from the first CSV file in the rows_csv directory.
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            encoders = json_load(encoding)
            name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
            preql = loom.preql.PreQL(query_server, encoding)
            preql.predict(rows_in, COUNT, result_out, id_offset=False)
            with open_compressed(rows_in, 'rb') as fin:
                with open(result_out, 'r') as fout:
                    in_reader = csv.reader(fin)
                    out_reader = csv.reader(fout)
                    # Read the input header; skip the output header.
                    fnames = in_reader.next()
                    out_reader.next()
                    for in_row in in_reader:
                        # Each input row yields COUNT consecutive output rows.
                        for i in range(COUNT):
                            out_row = out_reader.next()
                            bundle = zip(fnames, in_row, out_row)
                            for name, in_val, out_val in bundle:
                                encode = name_to_encoder[name]
                                # A blank input cell means "unobserved".
                                observed = bool(in_val.strip())
                                if observed:
                                    assert_almost_equal(
                                        encode(in_val),
                                        encode(out_val))
                                else:
                                    assert_true(bool(out_val.strip()))
Beispiel #15
0
def test_make_schema(model, **unused):
    """make_schema must derive and write a schema file from the model."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        schema_out = os.path.abspath('schema.json.gz')
        loom.format.make_schema(model_in=model, schema_out=schema_out)
        assert_found(schema_out)
Beispiel #16
0
def _import_rows_dir(encoding_in, rows_csv_in, rows_out, id_offset, id_stride):
    """Import a directory of row CSV files into a single rows_out stream.

    Each file is imported in parallel into a numbered part file; the parts
    are then concatenated, in sorted-filename order, into rows_out.
    """
    assert os.path.isdir(rows_csv_in)
    files_in = sorted(
        os.path.abspath(os.path.join(rows_csv_in, f))
        for f in os.listdir(rows_csv_in)
    )
    file_count = len(files_in)
    assert file_count > 0, 'no files in {}'.format(rows_csv_in)
    # Part names use a 6-digit index below, hence the 1e6 cap.
    assert file_count < 1e6, 'too many files in {}'.format(rows_csv_in)
    files_out = []
    tasks = []
    for i, file_in in enumerate(files_in):
        file_out = 'part_{:06d}.{}'.format(i, os.path.basename(rows_out))
        # Stagger ids across parts so no two parts produce the same id:
        # part i presumably emits offset, offset + stride, ... — confirm
        # against _import_rows_file's id handling.
        offset = id_offset + id_stride * i
        stride = id_stride * file_count
        files_out.append(file_out)
        tasks.append((encoding_in, file_in, file_out, offset, stride))
    # Resolve to an absolute path before entering the scratch directory,
    # which changes the cwd.
    rows_out = os.path.abspath(rows_out)
    with tempdir():
        loom.util.parallel_map(_import_rows_file, tasks)
        # It is safe use open instead of open_compressed even for .gz files;
        # see http://stackoverflow.com/questions/8005114
        with open(rows_out, 'wb') as whole:
            for file_out in files_out:
                with open(file_out, 'rb') as part:
                    shutil.copyfileobj(part, whole)
                os.remove(file_out)
Beispiel #17
0
def _import_dir(import_file, args):
    """Import every file in a directory of row CSVs, then concatenate parts.

    import_file: worker function handed to loom.util.parallel_map.
    args: (rows_csv_in, file_out, id_offset, id_stride, misc) tuple.

    Each input file is imported in parallel into its own part file; the
    parts are then concatenated, in sorted-filename order, into file_out.
    """
    rows_csv_in, file_out, id_offset, id_stride, misc = args
    assert os.path.isdir(rows_csv_in)
    parts_in = sorted(
        os.path.abspath(os.path.join(rows_csv_in, f))
        for f in os.listdir(rows_csv_in))
    part_count = len(parts_in)
    assert part_count > 0, 'no files in {}'.format(rows_csv_in)
    parts_out = []
    tasks = []
    for i, part_in in enumerate(parts_in):
        part_out = 'part.{}.{}'.format(i, os.path.basename(file_out))
        # Stagger ids across parts so no two parts produce the same id:
        # part i presumably emits offset, offset + stride, ... — confirm
        # against the import_file worker's id handling.
        offset = id_offset + id_stride * i
        stride = id_stride * part_count
        parts_out.append(part_out)
        tasks.append((part_in, part_out, offset, stride, misc))
    with tempdir():
        loom.util.parallel_map(import_file, tasks)
        # It is safe use open instead of open_compressed even for .gz files;
        # see http://stackoverflow.com/questions/8005114
        with open(file_out, 'wb') as whole:
            for part_out in parts_out:
                with open(part_out, 'rb') as part:
                    shutil.copyfileobj(part, whole)
                os.remove(part_out)
Beispiel #18
0
def test_posterior_enum(rows, model, **unused):
    """posterior_enum must emit exactly the configured number of samples."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        sample_count = 7
        config = {
            'posterior_enum': {'sample_count': sample_count},
            'kernels': {
                'kind': {'row_queue_capacity': 0, 'score_parallel': False},
            },
        }
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)
        assert_true(os.path.exists(config_in))

        samples_out = os.path.abspath('samples.pbs.gz')
        loom.runner.posterior_enum(
            config_in=config_in,
            model_in=model,
            rows_in=rows,
            samples_out=samples_out,
            debug=True)
        assert_true(os.path.exists(samples_out))
        actual_count = sum(1 for _ in protobuf_stream_load(samples_out))
        assert_equal(actual_count, sample_count)
Beispiel #19
0
def test_tare(rows, schema_row, **unused):
    """tare must write a tare-rows stream for the given schema row."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        tares_out = os.path.abspath('tares.pbs.gz')
        loom.runner.tare(
            schema_row_in=schema_row, rows_in=rows, tares_out=tares_out)
        assert_found(tares_out)
Beispiel #20
0
def test_sparsify(rows, schema_row, **unused):
    """tare then sparsify must write tares and diffs files."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        tares = os.path.abspath('tares.pbs.gz')
        diffs = os.path.abspath('diffs.pbs.gz')
        loom.runner.tare(
            schema_row_in=schema_row,
            rows_in=rows,
            tares_out=tares)
        assert_found(tares)
        loom.runner.sparsify(
            schema_row_in=schema_row,
            tares_in=tares,
            rows_in=rows,
            rows_out=diffs,
            debug=True)
        assert_found(diffs)
Beispiel #21
0
def test_samples_match_scores(root, rows, **unused):
    """Spot-check ~5 evenly spaced rows: marginal samples must match scores."""
    loaded = load_rows(rows)
    step = len(loaded) / 5  # Python 2 integer division
    selected = loaded[::step]
    with tempdir():
        loom.config.config_dump({'seed': SEED}, 'config.pb.gz')
        with loom.query.get_server(root, 'config.pb.gz', debug=True) as server:
            for row in selected:
                _check_marginal_samples_match_scores(server, row, 0)
Beispiel #22
0
def test_samples_match_scores(root, rows, **unused):
    """Spot-check ~5 evenly spaced rows: marginal samples must match scores."""
    loaded = load_rows(rows)
    step = len(loaded) / 5  # Python 2 integer division
    selected = loaded[::step]
    with tempdir():
        loom.config.config_dump({'seed': SEED}, 'config.pb.gz')
        with loom.query.get_server(root, 'config.pb.gz', debug=True) as server:
            for row in selected:
                _check_marginal_samples_match_scores(server, row, 0)
Beispiel #23
0
def test_group_runs(root, schema, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, encoding, debug=True) as preql:
            test_columns = json_load(schema).keys()[:10]
            for column in test_columns:
                groupings_csv = 'group.{}.csv'.format(column)
                preql.group(column, result_out=groupings_csv)
                print open(groupings_csv).read()
Beispiel #24
0
def test_group_runs(root, schema, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, encoding, debug=True) as preql:
            test_columns = json_load(schema).keys()[:10]
            for column in test_columns:
                groupings_csv = 'group.{}.csv'.format(column)
                preql.group(column, result_out=groupings_csv)
                print open(groupings_csv).read()
Beispiel #25
0
def test_make_fake_encoding(schema, model, **unused):
    """make_fake_encoding must write an encoding for schema + model."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        encoding_out = os.path.abspath('encoding.json.gz')
        loom.format.make_fake_encoding(
            schema_in=schema, model_in=model, encoding_out=encoding_out)
        assert_found(encoding_out)
Beispiel #26
0
def test_transforms():
    """End-to-end transform -> ingest -> infer on a generated example."""
    name = 'test_transforms.test_transforms'
    with tempdir() as workdir:
        schema_csv = os.path.join(workdir, 'schema.csv')
        rows_csv = os.path.join(workdir, 'rows.csv.gz')
        generate_example(schema_csv, rows_csv)
        loom.tasks.transform(name, schema_csv, rows_csv)
        loom.tasks.ingest(name)
        loom.tasks.infer(name, sample_count=1)
def _test_dump_load(dump, load, filetype):
    for example in EXAMPLES:
        print example
        with fileutil.tempdir() as d, fileutil.chdir(d):
            expected = example
            filename = 'test.json' + filetype
            dump(expected, filename)
            actual = list(load(filename))
            assert_equal(actual, expected)
Beispiel #28
0
def _test_protobuf_stream(filetype):
    filename = 'test.stream' + filetype
    expected = ['asdf', '', 'asdfasdfasdf', 'a', 's', '', '', '', 'd', 'f']
    with fileutil.tempdir():
        print 'dumping'
        io.stream.protobuf_stream_dump(expected, filename)
        print 'loading'
        actual = list(io.stream.protobuf_stream_load(filename))
    assert_equal(actual, expected)
Beispiel #29
0
def test_shuffle(rows, **unused):
    """shuffle must write a shuffled row stream."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        shuffled_out = os.path.abspath('rows_out.pbs.gz')
        loom.runner.shuffle(rows_in=rows, rows_out=shuffled_out, seed=12345)
        assert_true(os.path.exists(shuffled_out))
Beispiel #30
0
def test_shuffle(diffs, **unused):
    """shuffle must write a shuffled diff stream."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        shuffled_out = os.path.abspath('shuffled.pbs.gz')
        loom.runner.shuffle(rows_in=diffs, rows_out=shuffled_out, seed=12345)
        assert_found(shuffled_out)
Beispiel #31
0
def test_relate(root, encoding, **unused):
    """preql.relate over all features must produce parseable CSV output."""
    with loom.query.get_server(root, debug=True) as query_server:
        with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
            result_out = 'related_out.csv'
            preql = loom.preql.PreQL(query_server, encoding)
            preql.relate(preql.feature_names, result_out, sample_count=10)
            # Merely iterate the output to verify it parses as CSV.
            with open(result_out, 'r') as f:
                for _ in csv.reader(f):
                    pass
Beispiel #32
0
def test_import_rows(encoding, rows, rows_csv, **unused):
    """import_rows must convert CSV to pbs, preserving the row count."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        rows_pbs = os.path.abspath('rows.pbs.gz')
        loom.format.import_rows(
            encoding_in=encoding, rows_csv_in=rows_csv, rows_out=rows_pbs)
        assert_found(rows_pbs)
        expected_count = sum(1 for _ in protobuf_stream_load(rows))
        actual_count = sum(1 for _ in protobuf_stream_load(rows_pbs))
        assert_equal(actual_count, expected_count)
Beispiel #33
0
def test_predict(root, rows_csv, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            preql.predict(rows_in, COUNT, result_out, id_offset=True)
            print 'DEBUG', open_compressed(rows_in).read()
            print 'DEBUG', open_compressed(result_out).read()
            _check_predictions(rows_in, result_out, encoding)
Beispiel #34
0
def test_seed(root, model, rows, **unused):
    """Query responses must be deterministic in the seed.

    Two servers configured with seed 0 must agree exactly; a server with
    seed 10 must produce different responses.
    """
    requests = get_example_requests(model, rows, 'mixed')
    # First run with seed 0.
    with tempdir():
        loom.config.config_dump({'seed': 0}, 'config.pb.gz')
        with loom.query.ProtobufServer(root, config='config.pb.gz') as server:
            responses1 = [get_response(server, req) for req in requests]

    # Second run with the same seed — must reproduce responses1.
    with tempdir():
        loom.config.config_dump({'seed': 0}, 'config.pb.gz')
        with loom.query.ProtobufServer(root, config='config.pb.gz') as server:
            responses2 = [get_response(server, req) for req in requests]

    # Third run with a different seed — must differ.
    with tempdir():
        loom.config.config_dump({'seed': 10}, 'config.pb.gz')
        with loom.query.ProtobufServer(root, config='config.pb.gz') as server:
            responses3 = [get_response(server, req) for req in requests]

    assert_equal(responses1, responses2)
    assert_not_equal(responses1, responses3)
Beispiel #35
0
def _test_pair(dump, load, filetype):
    dump, load = named_pairs[dump, load]
    for example in EXAMPLES:
        print example
        with fileutil.tempdir():
            expected = example
            filename = 'test.json' + filetype
            dump(expected, filename)
            actual = list(load(filename))
            assert_equal(actual, expected)
Beispiel #36
0
def test_predict(root, rows_csv, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            preql.predict(rows_in, COUNT, result_out, id_offset=True)
            print 'DEBUG', open_compressed(rows_in).read()
            print 'DEBUG', open_compressed(result_out).read()
            _check_predictions(rows_in, result_out, encoding)
Beispiel #37
0
def test_import_rows(encoding, rows, rows_csv, **unused):
    """import_rows must convert CSV to pbs, preserving the row count."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        rows_pbs = os.path.abspath('rows.pbs.gz')
        loom.format.import_rows(
            encoding_in=encoding, rows_csv_in=rows_csv, rows_out=rows_pbs)
        assert_found(rows_pbs)
        expected_count = sum(1 for _ in protobuf_stream_load(rows))
        actual_count = sum(1 for _ in protobuf_stream_load(rows_pbs))
        assert_equal(actual_count, expected_count)
Beispiel #38
0
def test_make_encoding(schema, rows_csv, **unused):
    """make_encoding then import_rows must both write their outputs."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        encoding_out = os.path.abspath('encoding.json.gz')
        rows_out = os.path.abspath('rows.pbs.gz')
        loom.format.make_encoding(
            schema_in=schema, rows_in=rows_csv, encoding_out=encoding_out)
        assert_found(encoding_out)
        loom.format.import_rows(
            encoding_in=encoding_out, rows_csv_in=rows_csv, rows_out=rows_out)
        assert_found(rows_out)
Beispiel #39
0
def test_sparsify(rows, schema_row, **unused):
    """tare then sparsify must write tares and diffs files."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        tares = os.path.abspath('tares.pbs.gz')
        diffs = os.path.abspath('diffs.pbs.gz')
        loom.runner.tare(
            schema_row_in=schema_row, rows_in=rows, tares_out=tares)
        assert_found(tares)
        loom.runner.sparsify(
            schema_row_in=schema_row,
            tares_in=tares,
            rows_in=rows,
            rows_out=diffs,
            debug=True)
        assert_found(diffs)
Beispiel #40
0
def test_similar_runs(root, rows_csv, **unused):
    """preql.similar must run on a small batch of rows.

    Fix: when the header has no '_id' column, id_pos is None and
    row.pop(None) raises TypeError; only strip the id column when present.
    """
    rows = load_rows_csv(rows_csv)
    header = rows.pop(0)
    try:
        id_pos = header.index('_id')
    except ValueError:
        id_pos = None
    rows = rows[0:10]
    # Strip the id column only when it exists.
    if id_pos is not None:
        for row in rows:
            row.pop(id_pos)
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            search_csv = 'search.csv'
            preql.similar(rows, result_out=search_csv)
Beispiel #41
0
def test_similar_runs(root, rows_csv, **unused):
    """preql.similar must run on a small batch of rows.

    Fix: when the header has no '_id' column, id_pos is None and
    row.pop(None) raises TypeError; only strip the id column when present.
    """
    rows = load_rows_csv(rows_csv)
    header = rows.pop(0)
    try:
        id_pos = header.index('_id')
    except ValueError:
        id_pos = None
    rows = rows[0:10]
    # Strip the id column only when it exists.
    if id_pos is not None:
        for row in rows:
            row.pop(id_pos)
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            search_csv = 'search.csv'
            preql.similar(rows, result_out=search_csv)
Beispiel #42
0
def test_make_encoding(schema, rows_csv, **unused):
    """make_encoding then import_rows must both write their outputs."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        encoding_out = os.path.abspath('encoding.json.gz')
        rows_out = os.path.abspath('rows.pbs.gz')
        loom.format.make_encoding(
            schema_in=schema, rows_in=rows_csv, encoding_out=encoding_out)
        assert_found(encoding_out)
        loom.format.import_rows(
            encoding_in=encoding_out, rows_csv_in=rows_csv, rows_out=rows_out)
        assert_found(rows_out)
Beispiel #43
0
def test_server(model, groups, **unused):
    """Exercise loom.query.serve with sample requests and score-only requests.

    First server: run the example requests as-is and collect responses.
    Second server: re-issue each request as a score-only request reusing its
    observed mask. The first server's responses are checked at the end.
    """
    requests = get_example_requests(model)
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(CONFIG, config_in)
        kwargs = {
            'config_in': config_in,
            'model_in': model,
            'groups_in': groups,
            'debug': True,
        }
        with loom.query.serve(**kwargs) as server:
            responses = [server.call_protobuf(request) for request in requests]

    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(CONFIG, config_in)
        kwargs = {
            'config_in': config_in,
            'model_in': model,
            'groups_in': groups,
            'debug': True,
        }
        with loom.query.serve(**kwargs) as server:
            for request in requests:
                # Build a score-only request that reuses the sample
                # request's id and observed mask.
                req = Query.Request()
                req.id = request.id
                req.score.data.observed[:] = request.sample.data.observed[:]
                res = server.call_protobuf(req)
                assert_equal(req.id, res.id)
                assert_false(hasattr(req, 'error'))
                assert_true(isinstance(res.score.score, float))

    # Check the first server's sampling responses after both servers closed.
    for request, response in izip(requests, responses):
        assert_equal(request.id, response.id)
        assert_false(hasattr(request, 'error'))
        assert_equal(len(response.sample.samples), 1)
Beispiel #44
0
def test_search_runs(root, rows_csv, **unused):
    """Run preql.search on up to 10 rows and read each result CSV.

    Fix: when the header has no '_id' column, id_pos is None and
    row.pop(None) raises TypeError; only strip the id column when present.
    """
    rows = load_rows_csv(rows_csv)
    header = rows.pop(0)
    try:
        id_pos = header.index('_id')
    except ValueError:
        id_pos = None
    rows = rows[0:10]
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            for i, row in enumerate(rows):
                # Strip the id column only when it exists.
                if id_pos is not None:
                    row.pop(id_pos)
                search_csv = 'search.{}.csv'.format(i)
                preql.search(row, result_out=search_csv)
                open(search_csv).read()
Beispiel #45
0
def test_search_runs(root, rows_csv, **unused):
    """Run preql.search on up to 10 rows and read each result CSV.

    Fix: when the header has no '_id' column, id_pos is None and
    row.pop(None) raises TypeError; only strip the id column when present.
    """
    rows = load_rows_csv(rows_csv)
    header = rows.pop(0)
    try:
        id_pos = header.index('_id')
    except ValueError:
        id_pos = None
    rows = rows[0:10]
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            for i, row in enumerate(rows):
                # Strip the id column only when it exists.
                if id_pos is not None:
                    row.pop(id_pos)
                search_csv = 'search.{}.csv'.format(i)
                preql.search(row, result_out=search_csv)
                open(search_csv).read()
Beispiel #46
0
def test_one_to_one(rows, **unused):
    """Shuffling permutes rows without adding, dropping, or altering any."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        shuffled_path = os.path.abspath('rows_out.pbs.gz')
        loom.runner.shuffle(rows_in=rows, rows_out=shuffled_path, seed=12345)
        assert_found(shuffled_path)

        before = load_rows(rows)
        after = load_rows(shuffled_path)
        assert_equal(len(after), len(before))
        # the order must actually change ...
        assert_not_equal(after, before)

        # ... but sorting by id must recover exactly the original rows
        by_id = lambda row: row.id
        assert_list_equal(sorted(before, key=by_id), sorted(after, key=by_id))
Beispiel #47
0
def test_batch_predict(model, groups, **unused):
    """Batch prediction answers every request, in order, without errors."""
    requests = get_example_requests(model)
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(CONFIG, config_in)
        kwargs = {
            'config_in': config_in,
            'model_in': model,
            'groups_in': groups,
            'requests': requests,
            'debug': True,
        }
        responses = loom.query.batch_predict(**kwargs)
    assert_equal(len(responses), len(requests))
    # responses arrive in request order, tagged with the request id
    for request, response in izip(requests, responses):
        assert_equal(request.id, response.id)
        assert_false(hasattr(request, 'error'))
        assert_equal(len(response.sample.samples), 1)
Beispiel #48
0
def _test_generate(feature_type):
    """Smoke-test synthetic dataset generation for one feature type."""
    root = os.getcwd()
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        # resolve output paths inside the temp dir before chdir-ing away
        outputs = {
            'init_out': os.path.abspath('init.pb.gz'),
            'rows_out': os.path.abspath('rows.pbs.gz'),
            'model_out': os.path.abspath('model.pb.gz'),
            'groups_out': os.path.abspath('groups'),
        }
        # run generate from the original working directory
        os.chdir(root)
        loom.generate.generate(
            feature_type=feature_type,
            row_count=100,
            feature_count=100,
            density=0.5,
            debug=True,
            profile=None,
            **outputs)
Beispiel #49
0
def test_relate(root, **unused):
    """relate() writes a symmetric feature-by-feature score matrix."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            result_out = 'related_out.csv'
            preql.relate(preql.feature_names, result_out, sample_count=10)
            with open(result_out, 'r') as f:
                reader = csv.reader(f)
                header = next(reader)
                # first header cell labels the row names; the rest are features
                assert_equal(header[1:], preql.feature_names)
                size = len(header) - 1
                zmatrix = numpy.zeros((size, size))
                for i, row in enumerate(reader):
                    assert_equal(row[0], preql.feature_names[i])
                    for j, cell in enumerate(row[1:]):
                        zmatrix[i][j] = float(cell)
                # relatedness scores must be symmetric
                assert_close(zmatrix, zmatrix.T)
Beispiel #50
0
def test_chunking(rows, **unused):
    """Shuffle output must be independent of the memory-chunking target."""
    targets = [10.0 ** i for i in xrange(6)]
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        out_template = os.path.abspath('rows.out.{}.pbs.gz')

        # shuffle the same input once per memory target, same seed each time
        for i, target_bytes in enumerate(targets):
            loom.runner.shuffle(
                rows_in=rows,
                rows_out=out_template.format(i),
                seed=12345,
                target_mem_bytes=target_bytes)

        # every target size must yield identical shuffled output
        results = [
            load_rows_raw(out_template.format(i))
            for i in xrange(len(targets))
        ]
        for i, actual in enumerate(results):
            for expected in results[:i]:
                assert_list_equal(actual, expected)
Beispiel #51
0
def test_export_rows(encoding, rows, **unused):
    """Round-tripping rows through CSV export/import preserves the diffs."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        csv_dir = os.path.abspath('rows_csv')
        pbs_path = os.path.abspath('rows.pbs.gz')

        # export writes chunked, gzipped CSV files into csv_dir
        loom.format.export_rows(
            encoding_in=encoding,
            rows_in=rows,
            rows_csv_out=csv_dir,
            chunk_size=51)
        assert_found(csv_dir)
        assert_found(os.path.join(csv_dir, 'rows.0.csv.gz'))

        # import reads them back into protobuf-stream form
        loom.format.import_rows(
            encoding_in=encoding,
            rows_csv_in=csv_dir,
            rows_out=pbs_path)
        assert_found(pbs_path)

        expected = load_rows(rows)
        actual = load_rows(pbs_path)
        assert_equal(len(actual), len(expected))
        # chunking may reorder rows, so compare after sorting by id
        by_id = lambda row: row.id
        actual.sort(key=by_id)
        expected.sort(key=by_id)
        actual_data = [row.diff for row in actual]
        expected_data = [row.diff for row in expected]
        assert_close(actual_data, expected_data)
Beispiel #52
0
def test_shuffle(diffs, **unused):
    """Smoke-test that shuffling a diff-row stream produces an output file."""
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        seed = 12345  # fixed seed keeps the shuffle deterministic
        rows_out = os.path.abspath('shuffled.pbs.gz')
        loom.runner.shuffle(rows_in=diffs, rows_out=rows_out, seed=seed)
        assert_found(rows_out)