Example 1: batch_predict
def batch_predict(
        config_in,
        model_in,
        groups_in,
        requests,
        debug=False,
        profile=None):
    root = os.path.abspath(os.path.curdir)
    # tempdir() switches into a scratch directory, so these paths land there.
    with tempdir(cleanup_on_error=(not debug)):
        requests_in = os.path.abspath('requests.pbs.gz')
        responses_out = os.path.abspath('responses.pbs.gz')
        protobuf_stream_dump(
            (q.SerializeToString() for q in requests),
            requests_in)

        os.chdir(root)  # return to the original cwd before running the query
        loom.runner.query(
            config_in=config_in,
            model_in=model_in,
            groups_in=groups_in,
            requests_in=requests_in,
            responses_out=responses_out,
            debug=debug,
            profile=profile)

        # Stream the serialized responses back in and parse each one.
        return map(parse_response, protobuf_stream_load(responses_out))
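
A minimal usage sketch (not from the source): the sample paths here are
hypothetical, and requests is assumed to be a prebuilt list of query-request
protobuf messages.

responses = batch_predict(
    config_in='samples/sample.0/config.pb.gz',
    model_in='samples/sample.0/model.pb.gz',
    groups_in='samples/sample.0/groups',
    requests=requests,
    debug=False)
for response in responses:
    print(response)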
Example 2: crossvalidate_one
def crossvalidate_one(seed, test_count, train_count, inputs, results,
                      extra_passes, debug):
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(results['root'], 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')

    config = {
        'seed': seed,
        'schedule': {
            'extra_passes': extra_passes
        },
    }
    loom.config.config_dump(config, results['samples'][0]['config'])

    # Build a reproducible boolean mask: True rows go to train, False to test.
    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump((row for s, row in izip(split, diffs_in) if s),
                         results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump((row for s, row in izip(split, rows_in) if not s),
                         results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(rows_in=results['train'],
                        rows_out=results['samples'][0]['shuffled'],
                        seed=seed,
                        debug=debug)
    LOG(' init')
    loom.generate.generate_init(encoding_in=inputs['ingest']['encoding'],
                                model_out=results['samples'][0]['init'],
                                seed=seed)
    LOG(' infer')
    loom.runner.infer(config_in=results['samples'][0]['config'],
                      rows_in=results['samples'][0]['shuffled'],
                      tares_in=inputs['ingest']['tares'],
                      model_in=results['samples'][0]['init'],
                      model_out=results['samples'][0]['model'],
                      groups_out=results['samples'][0]['groups'],
                      debug=debug)
    LOG(' query')
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    # Score each held-out test row against the freshly trained model.
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]

    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)
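
Each call scores a single random train/test split, so a driver would
typically average over several seeds. A hedged sketch: the inputs and results
dicts are assumed to be built elsewhere (e.g. a loom.store paths dict), and
the counts and pass schedule are illustrative.

means = [
    crossvalidate_one(seed, test_count=100, train_count=900,
                      inputs=inputs, results=results,
                      extra_passes=8.0, debug=False)
    for seed in range(10)
]
print(sum(means) / len(means))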
Example 3: dump_rows
def dump_rows(table, rows_name):
    # Write each serialized row of the table into one protobuf stream file.
    protobuf_stream_dump(serialize_rows(table), rows_name)
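
A round-trip sketch; Row is a hypothetical protobuf message type standing in
for whatever serialize_rows emits:

dump_rows(table, 'rows.pbs.gz')
for blob in protobuf_stream_load('rows.pbs.gz'):
    row = Row()
    row.ParseFromString(blob)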
Example 4: import_tares
def import_tares(schema):
    # The tare is a baseline row with every field observed and all booleans
    # False; sparsify later stores each data row as a diff against it.
    tare = ProductValue()
    tare.observed.sparsity = ProductValue.Observed.ALL
    tare.booleans[:] = [False] * len(schema)
    protobuf_stream_dump([tare.SerializeToString()], TARES)
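
Reading the tare back is straightforward, since the stream holds a single
message; a sketch, where schema is the same mapping passed to import_tares:

blobs = list(protobuf_stream_load(TARES))
tare = ProductValue()
tare.ParseFromString(blobs[0])
assert len(tare.booleans) == len(schema)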
Example 5: generate_one
def generate_one((name, sample_count, force, debug)):
    # Single tuple argument (Python 2 tuple unpacking) so the function can be
    # driven directly by map().
    paths = loom.store.get_paths(name, sample_count=sample_count)
    # Skip regeneration if every artifact exists and matches this version.
    if not force and all(os.path.exists(f) for f in paths.itervalues()):
        with open_compressed(paths['ingest']['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return
    print 'generating', name
    mkdir_p(paths['root'])
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    config = CONFIGS[name]
    # Export the CSV in roughly eight chunks of at least ten rows each.
    chunk_size = max(10, (config['row_count'] + 7) / 8)
    loom.transforms.make_fake_transforms(
        transforms_out=paths['ingest']['transforms'])
    loom.generate.generate(
        init_out=paths['samples'][0]['init'],
        rows_out=paths['ingest']['rows'],
        model_out=paths['samples'][0]['model'],
        groups_out=paths['samples'][0]['groups'],
        assign_out=paths['samples'][0]['assign'],
        **config)
    loom.format.make_schema(
        model_in=paths['samples'][0]['model'],
        schema_out=paths['ingest']['schema'])
    loom.format.make_fake_encoding(
        schema_in=paths['ingest']['schema'],
        model_in=paths['samples'][0]['model'],
        encoding_out=paths['ingest']['encoding'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    # Find the tare row, then re-encode all rows as sparse diffs against it.
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_in=paths['ingest']['rows'],
        rows_csv_out=paths['ingest']['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=paths['ingest']['rows_csv'],
        rowids_out=paths['ingest']['rowids'],
        id_field='_id')
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])
    # Give every sample its own seed, config, init model, and row order.
    for seed, sample in enumerate(paths['samples']):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=paths['ingest']['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=paths['ingest']['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])
    # Derive the remaining samples from sample 0: copy it verbatim when
    # LOOM_DEBUG_MIX is set, otherwise run the mix kernel.
    sample0 = paths['samples'][0]
    for seed, sample in enumerate(paths['samples'][1:]):
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=paths['ingest']['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(
        paths=paths,
        debug=debug)
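
Because generate_one takes all of its arguments as a single tuple, a driver
can fan it out over a worker pool; a sketch, where the sample_count, force,
and debug values are illustrative:

import multiprocessing

tasks = [(name, 2, False, False) for name in sorted(CONFIGS)]
pool = multiprocessing.Pool()
pool.map(generate_one, tasks)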