def batch_predict(
        config_in, model_in, groups_in, requests, debug=False, profile=None):
    # Serialize the requests into a temp dir, run the loom query engine on
    # them, then parse the streamed responses back into Python objects.
    root = os.path.abspath(os.path.curdir)
    with tempdir(cleanup_on_error=(not debug)):
        requests_in = os.path.abspath('requests.pbs.gz')
        responses_out = os.path.abspath('responses.pbs.gz')
        protobuf_stream_dump(
            (q.SerializeToString() for q in requests),
            requests_in)
        os.chdir(root)
        loom.runner.query(
            config_in=config_in,
            model_in=model_in,
            groups_in=groups_in,
            requests_in=requests_in,
            responses_out=responses_out,
            debug=debug,
            profile=profile)
        # map() is eager in Python 2, so all responses are parsed before the
        # temp dir holding responses_out is cleaned up.
        return map(parse_response, protobuf_stream_load(responses_out))
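# Usage sketch (added, not from the original source): a minimal driver for
# batch_predict, assuming `paths` follows the loom.store layout used by
# generate_one below, and that `requests` is an iterable of loom
# query-request protobuf messages (batch_predict only needs each element to
# support SerializeToString()). The helper name and sample_count are
# illustrative.
def demo_batch_predict(name, requests):
    paths = loom.store.get_paths(name, sample_count=1)
    return batch_predict(
        config_in=paths['query']['config'],
        model_in=paths['samples'][0]['model'],
        groups_in=paths['samples'][0]['groups'],
        requests=requests,
        debug=True)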
def crossvalidate_one(
        seed, test_count, train_count, inputs, results, extra_passes, debug):
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(results['root'], 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    config = {
        'seed': seed,
        'schedule': {'extra_passes': extra_passes},
    }
    loom.config.config_dump(config, results['samples'][0]['config'])

    # Split rows into train/test with a seeded shuffle, so the split is
    # reproducible per seed. The diffs and rows streams are row-aligned, so
    # the same boolean mask selects training diffs and held-out raw rows.
    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump(
        (row for s, row in izip(split, diffs_in) if s),
        results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump(
        (row for s, row in izip(split, rows_in) if not s),
        results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(
        rows_in=results['train'],
        rows_out=results['samples'][0]['shuffled'],
        seed=seed,
        debug=debug)
    LOG(' init')
    loom.generate.generate_init(
        encoding_in=inputs['ingest']['encoding'],
        model_out=results['samples'][0]['init'],
        seed=seed)
    LOG(' infer')
    loom.runner.infer(
        config_in=results['samples'][0]['config'],
        rows_in=results['samples'][0]['shuffled'],
        tares_in=inputs['ingest']['tares'],
        model_in=results['samples'][0]['init'],
        model_out=results['samples'][0]['model'],
        groups_out=results['samples'][0]['groups'],
        debug=debug)
    LOG(' query')
    # Score each held-out row under the freshly trained model.
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]
    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)
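# Usage sketch (added, not from the original source): run the crossvalidation
# above over several seeds and average the per-seed mean scores. The 90/10
# split, the seed count, and the helper name are illustrative; `inputs` and
# `results` are loom.store-style path dicts as elsewhere in this file. Note
# that crossvalidate_one writes all outputs under results['root'], so each
# seed here overwrites the previous seed's files.
def demo_crossvalidate(inputs, results, row_count, extra_passes=8.0):
    test_count = max(1, row_count / 10)  # Python 2 integer division
    train_count = row_count - test_count
    means = [
        crossvalidate_one(
            seed, test_count, train_count,
            inputs, results, extra_passes, debug=False)
        for seed in xrange(5)
    ]
    return numpy.mean(means)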
def dump_rows(table, rows_name):
    protobuf_stream_dump(serialize_rows(table), rows_name)
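# Usage note (added): `table` is whatever serialize_rows accepts in this
# module's scope (assumed to be an in-memory table of raw row values), e.g.
#   dump_rows(table, 'rows.pbs.gz')
# writes one serialized protobuf row per stream entry.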
def import_tares(schema):
    # The tare row marks every field as observed, with a default (False)
    # boolean value per schema field; sparsified rows are later stored as
    # diffs against it.
    tare = ProductValue()
    tare.observed.sparsity = ProductValue.Observed.ALL
    tare.booleans[:] = [False] * len(schema)
    protobuf_stream_dump([tare.SerializeToString()], TARES)
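# Verification sketch (added, not from the original source): read the tare
# stream back and check it is a single all-False row. ProductValue and TARES
# are the same module-scope names that import_tares itself assumes, and
# ParseFromString is the standard protobuf deserializer.
def demo_check_tares(schema):
    import_tares(schema)
    tares = []
    for blob in protobuf_stream_load(TARES):
        tare = ProductValue()
        tare.ParseFromString(blob)
        tares.append(tare)
    assert len(tares) == 1
    assert list(tares[0].booleans) == [False] * len(schema)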
def generate_one((name, sample_count, force, debug)):
    paths = loom.store.get_paths(name, sample_count=sample_count)

    # Skip regeneration if a complete dataset from this loom version exists.
    if not force and all(os.path.exists(f) for f in paths.itervalues()):
        with open_compressed(paths['ingest']['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return
    print 'generating', name
    mkdir_p(paths['root'])
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)

    config = CONFIGS[name]
    chunk_size = max(10, (config['row_count'] + 7) / 8)

    # Generate a synthetic model and rows, then derive schema and encoding.
    loom.transforms.make_fake_transforms(
        transforms_out=paths['ingest']['transforms'])
    loom.generate.generate(
        init_out=paths['samples'][0]['init'],
        rows_out=paths['ingest']['rows'],
        model_out=paths['samples'][0]['model'],
        groups_out=paths['samples'][0]['groups'],
        assign_out=paths['samples'][0]['assign'],
        **config)
    loom.format.make_schema(
        model_in=paths['samples'][0]['model'],
        schema_out=paths['ingest']['schema'])
    loom.format.make_fake_encoding(
        schema_in=paths['ingest']['schema'],
        model_in=paths['samples'][0]['model'],
        encoding_out=paths['ingest']['encoding'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])

    # Tare the rows, then sparsify them into diffs against the tare.
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_in=paths['ingest']['rows'],
        rows_csv_out=paths['ingest']['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=paths['ingest']['rows_csv'],
        rowids_out=paths['ingest']['rowids'],
        id_field='_id')
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])

    # Per-sample config, init, and shuffle, each seeded by sample index.
    for seed, sample in enumerate(paths['samples']):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=paths['ingest']['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=paths['ingest']['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])

    # Derive the remaining samples from sample 0, either by copying the
    # files (debug mode) or by running the mixer.
    sample0 = paths['samples'][0]
    for sample in paths['samples'][1:]:
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=paths['ingest']['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(paths=paths, debug=debug)
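# Usage sketch (added, not from the original source): regenerate every named
# dataset, assuming CONFIGS is a name -> config dict as its use above
# suggests. generate_one takes a single (name, sample_count, force, debug)
# tuple via Python 2 tuple-unpacking, which also makes it easy to fan out
# over a multiprocessing.Pool.
def demo_generate_all(sample_count=2, force=False, debug=False):
    tasks = [
        (name, sample_count, force, debug)
        for name in sorted(CONFIGS.iterkeys())
    ]
    map(generate_one, tasks)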