def make_consensus(name, config=None, debug=False):
    '''
    Combine samples into a single consensus sample.

    Arguments:
        name            A unique identifier for consensus
        config          An optional json config file. Currently unused;
                        it will eventually support e.g. cluster coarseness.
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    loom.config.config_dump(config, paths['samples'][0]['config'])
    LOG('finding consensus')
    loom.consensus.make_consensus(paths=paths, debug=debug)
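
# A minimal usage sketch (hypothetical dataset name): assumes 'my_data' has
# already been ingested and inferred with multiple samples, via ingest() and
# infer(), before the samples are combined into a consensus.
#
#   import loom.tasks
#   loom.tasks.infer('my_data', sample_count=10)
#   loom.tasks.make_consensus('my_data')
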
def infer_one(name, seed=0, config=None, debug=False):
    '''
    Infer a single sample.

    Arguments:
        name            A unique identifier for ingest + inference
        seed            The seed, i.e., sample number, typically 0-9
        config          An optional json config file, e.g.,
                        {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name, sample_count=(1 + seed))
    sample = paths['samples'][seed]
    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    if 'seed' not in config:
        config['seed'] = seed
    loom.config.config_dump(config, sample['config'])
    LOG('generating init')
    loom.generate.generate_init(
        encoding_in=paths['ingest']['encoding'],
        model_out=sample['init'],
        seed=seed)
    LOG('shuffling rows')
    loom.runner.shuffle(
        rows_in=paths['ingest']['diffs'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)
    LOG('inferring, watch {}'.format(sample['infer_log']))
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=paths['ingest']['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        assign_out=sample['assign'],
        log_out=sample['infer_log'],
        debug=debug)
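
# A usage sketch (hypothetical dataset name): draws a single posterior sample
# for 'my_data', requesting extra inference passes via the inline config
# format shown in the docstring above.
#
#   loom.tasks.infer_one(
#       'my_data',
#       seed=0,
#       config={'schedule': {'extra_passes': 500.0}})
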
def transform(
        name,
        schema_csv='schema.csv',
        rows_csv='rows.csv.gz'):
    '''
    Transform dataset from fluent format to loom internal format.

    Arguments:
        name            A unique identifier for ingest + inference
        schema_csv      Schema file with columns [feature_name, datatype],
                        e.g.,
                            Feature Name,Type
                            full name,id
                            start date,optional_date
                            age,real
                            zipcode,unbounded_categorical
                            description,text
                        Loom assumes the first line is a header and
                        ignores it. Features without datatypes are ignored.
        rows_csv        File or directory of csv files or csv.gz files

    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    if not os.path.exists(schema_csv):
        raise LoomError('Missing schema_csv file: {}'.format(schema_csv))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))
    paths = loom.store.get_paths(name)
    LOG('making transforms')
    id_field = loom.transforms.make_transforms(
        schema_in=schema_csv,
        rows_in=rows_csv,
        schema_out=paths['ingest']['schema'],
        transforms_out=paths['ingest']['transforms'])
    LOG('transforming rows')
    loom.transforms.transform_rows(
        schema_in=paths['ingest']['schema'],
        transforms_in=paths['ingest']['transforms'],
        rows_in=rows_csv,
        rows_out=paths['ingest']['rows_csv'],
        id_field=id_field)
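
# A usage sketch (hypothetical file names): converts a fluent-format dataset
# to loom's internal format; the schema_csv columns follow the docstring
# example above.
#
#   loom.tasks.transform(
#       'my_data',
#       schema_csv='schema.csv',
#       rows_csv='rows.csv.gz')
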
def ingest(name, schema=None, rows_csv=None, id_field=None, debug=False):
    '''
    Ingest dataset with optional json config.

    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files or csv.gz files
        id_field        Column name of id field in input csv
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    if schema is None:
        schema = paths['ingest']['schema']
    if rows_csv is None:
        rows_csv = paths['ingest']['rows_csv']
    if not os.path.exists(schema):
        raise LoomError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)

    LOG('making schema row')
    loom.format.make_schema_row(
        schema_in=schema,
        schema_row_out=paths['ingest']['schema_row'])

    LOG('making encoding')
    loom.format.make_encoding(
        schema_in=schema,
        rows_in=rows_csv,
        encoding_out=paths['ingest']['encoding'])

    LOG('importing rows')
    loom.format.import_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_csv_in=rows_csv,
        rows_out=paths['ingest']['rows'])

    LOG('importing rowids')
    loom.format.import_rowids(
        rows_csv_in=rows_csv,
        rowids_out=paths['ingest']['rowids'],
        id_field=id_field)

    LOG('making tare rows')
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)

    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.config.config_dump({}, paths['query']['config'])
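
# A usage sketch (hypothetical names): after transform() has produced the
# internal schema and rows, ingest() may be called with defaults; the
# explicit form passes a json schema such as {"feature1": "nich"} and an
# assumed id column 'guid'.
#
#   loom.tasks.ingest('my_data')    # use files produced by transform()
#   loom.tasks.ingest(
#       'my_data',
#       schema='schema.json',
#       rows_csv='rows.csv.gz',
#       id_field='guid')
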
def make_transforms(schema_in, rows_in, schema_out, transforms_out):
    '''
    Compile a fluent schema into a basic schema plus a pickled list of
    row transforms. Returns the name of the id field, or None if absent.
    '''
    fluent_schema = load_schema(schema_in)
    basic_schema = {}
    pre_transforms = []
    transforms = []
    builders = []
    dates = [
        feature_name
        for feature_name, fluent_type in fluent_schema.iteritems()
        if fluent_type.endswith('date')
    ]
    id_field = None
    for feature_name, fluent_type in fluent_schema.iteritems():
        # parse adjectives
        if fluent_type.startswith('optional_'):
            transform = PresenceTransform(feature_name)
            pre_transforms.append(transform)
            transforms.append(transform)
            fluent_type = fluent_type[len('optional_'):]
            feature_name = '{}.value'.format(feature_name)
        # parse nouns
        if fluent_type == 'id':
            id_field = feature_name
        elif fluent_type in ['categorical', 'unbounded_categorical']:
            transforms.append(StringTransform(feature_name, fluent_type))
        elif fluent_type == 'percent':
            transforms.append(PercentTransform(feature_name))
        elif fluent_type == 'sparse_real':
            transforms.append(SparseRealTransform(feature_name))
        elif fluent_type == 'text':
            builders.append(TextTransformBuilder(feature_name))
        elif fluent_type == 'tags':
            builders.append(
                TextTransformBuilder(feature_name, allow_empty=True))
        elif fluent_type == 'date':
            relatives = [other for other in dates if other < feature_name]
            transforms.append(DateTransform(feature_name, relatives))
        else:
            basic_type = FLUENT_TO_BASIC[fluent_type]
            basic_schema[feature_name] = basic_type
    if builders:
        transforms += build_transforms(rows_in, pre_transforms, builders)
    for transform in transforms:
        basic_schema.update(transform.get_schema())
    json_dump(basic_schema, schema_out)
    pickle_dump(transforms, transforms_out)
    LOG('transformed {} -> {} features'.format(
        len(fluent_schema), len(basic_schema)))
    return id_field
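
# For illustration (hypothetical schema): given a fluent schema like
#
#   {'guid': 'id',
#    'description': 'text',
#    'age': 'optional_real',
#    'state': 'unbounded_categorical'}
#
# 'guid' is recorded as the id field, 'description' feeds a
# TextTransformBuilder, 'age' gains a PresenceTransform plus an 'age.value'
# feature, 'state' gets a StringTransform, and the function returns 'guid'.
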
def query(name, config=None, debug=False, profile=None):
    '''
    Start the query server.

    Arguments:
        name            A unique identifier for ingest + inference
        config          An optional json config file
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    LOG('starting query server')
    server = loom.preql.get_server(
        paths['root'],
        paths['ingest']['encoding'],
        config=config,
        debug=debug,
        profile=profile)
    return server
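
# A usage sketch (hypothetical names): the returned server is used as a
# context manager, as in test_all() below; the relate() call is an assumed
# PreQL method from loom.preql.
#
#   with loom.tasks.query('my_data') as server:
#       related = server.relate(['feature1'])
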
def crossvalidate_one(
        seed,
        test_count,
        train_count,
        inputs,
        results,
        extra_passes,
        debug):
    '''
    Run one random train/test split: infer a single sample on the train
    rows and return the mean predictive score on the held-out test rows.
    '''
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(results['root'], 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')

    config = {
        'seed': seed,
        'schedule': {'extra_passes': extra_passes},
    }
    loom.config.config_dump(config, results['samples'][0]['config'])

    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump(
        (row for s, row in izip(split, diffs_in) if s),
        results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump(
        (row for s, row in izip(split, rows_in) if not s),
        results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(
        rows_in=results['train'],
        rows_out=results['samples'][0]['shuffled'],
        seed=seed,
        debug=debug)
    LOG(' init')
    loom.generate.generate_init(
        encoding_in=inputs['ingest']['encoding'],
        model_out=results['samples'][0]['init'],
        seed=seed)
    LOG(' infer')
    loom.runner.infer(
        config_in=results['samples'][0]['config'],
        rows_in=results['samples'][0]['shuffled'],
        tares_in=inputs['ingest']['tares'],
        model_in=results['samples'][0]['init'],
        model_out=results['samples'][0]['model'],
        groups_out=results['samples'][0]['groups'],
        debug=debug)
    LOG(' query')
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]
    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)
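
# A driver sketch (hypothetical store names and counts): inputs and results
# are loom.store path dicts; averaging the returned mean scores over several
# seeds estimates held-out predictive performance.
#
#   inputs = loom.store.get_paths('my_data')
#   scores = [
#       crossvalidate_one(
#           seed,
#           test_count=100,
#           train_count=900,
#           inputs=inputs,
#           results=loom.store.get_paths('my_data.cv.{}'.format(seed)),
#           extra_passes=8.0,
#           debug=False)
#       for seed in range(5)
#   ]
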
def test_all(name, schema, rows_csv, **unused):
    name = os.path.join(name, 'test_tasks')
    paths = loom.store.get_paths(name)
    loom.datasets.clean(name)
    loom.tasks.ingest(name, schema, rows_csv, debug=True)
    loom.tasks.infer(
        name,
        sample_count=SAMPLE_COUNT,
        config=CONFIG,
        debug=True)
    loom.tasks.make_consensus(name, debug=True)

    LOG('querying')
    requests = get_example_requests(
        paths['samples'][0]['model'],
        paths['ingest']['rows'])
    with loom.tasks.query(paths['root'], debug=True) as server:
        pbserver = server._query_server.protobuf_server
        for request in requests:
            pbserver.send(request)
            response = pbserver.receive()
            check_response(request, response)