def pretty_print(filename, message_type='guess'):
    '''
    Print text/json/protobuf messages from a raw/gz/bz2 file.
    '''
    parts = os.path.basename(filename).split('.')
    if parts[-1] in ['gz', 'bz2']:
        parts.pop()
    protocol = parts[-1]
    if protocol == 'json':
        data = json_load(filename)
        print json.dumps(data, sort_keys=True, indent=4)
    elif protocol == 'pb':
        message = get_message(filename, message_type)
        with open_compressed(filename) as f:
            message.ParseFromString(f.read())
        print message
    elif protocol == 'pbs':
        message = get_message(filename, message_type)
        for string in protobuf_stream_load(filename):
            message.ParseFromString(string)
            print message
    elif protocol == 'pickle':
        data = pickle_load(filename)
        print repr(data)
    else:
        with open_compressed(filename) as f:
            for line in f:
                print line,

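# Usage sketch for pretty_print() above (assumes it is in scope); the file
# names are hypothetical and message_type is left at its 'guess' default.
# The protocol is chosen from the extension, after stripping .gz/.bz2:
pretty_print('encoding.json')    # pretty-printed json
pretty_print('model.pb.gz')      # single protobuf message, gz stripped first
pretty_print('assign.pbs.gz')    # protobuf stream, printed message by message
pretty_print('notes.txt.bz2')    # any other extension is dumped as plain text
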
def import_rows():
    row = Row()
    pos = row.diff.pos
    neg = row.diff.neg
    pos.observed.sparsity = ProductValue.Observed.SPARSE
    neg.observed.sparsity = ProductValue.Observed.SPARSE
    with open_compressed(RAW) as infile:
        doc_count = int(infile.next())
        word_count = int(infile.next())
        observed_count = int(infile.next())
        print 'Importing {} observations of {} words in {} documents'.format(
            observed_count, word_count, doc_count)
        with open_compressed(DIFFS, 'wb') as outfile:
            current_doc = None
            for line in infile:
                doc, feature, count = line.split()
                if doc != current_doc:
                    if current_doc is not None:
                        pos.observed.sparse.sort()
                        neg.observed.sparse.sort()
                        protobuf_stream_write(row.SerializeToString(), outfile)
                        print_dot(every=1000)
                    current_doc = doc
                    row.id = int(doc)
                    del pos.booleans[:]
                    del pos.observed.sparse[:]
                    del neg.booleans[:]
                    del neg.observed.sparse[:]
                feature = int(feature) - 1
                pos.observed.sparse.append(feature)
                pos.booleans.append(True)
                neg.observed.sparse.append(feature)
                neg.booleans.append(False)
            protobuf_stream_write(row.SerializeToString(), outfile)

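# The RAW file read by import_rows() above is assumed to follow a
# bag-of-words-style layout inferred from the parsing code (the values below
# are made up): three header lines giving document, word, and observation
# counts, then one "doc feature count" triple per line, grouped by document.
#
#   3
#   5
#   4
#   1 2 1
#   1 4 3
#   2 1 2
#   3 5 1
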
def test_predict(root, rows_csv, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            preql.predict(rows_in, COUNT, result_out, id_offset=True)
            print 'DEBUG', open_compressed(rows_in).read()
            print 'DEBUG', open_compressed(result_out).read()
            _check_predictions(rows_in, result_out, encoding)

def force_ascii(filename_in, filename_out=None, size=4096):
    with ExitStack() as stack:
        with_ = stack.enter_context
        if filename_out is None:
            filename_out = with_(loom.util.temp_copy(filename_in))
        source = with_(open_compressed(filename_in, 'rb'))
        destin = with_(open_compressed(filename_out, 'w'))
        chunk = source.read(size)
        while chunk:
            destin.write(re_nonascii.sub('', chunk))
            chunk = source.read(size)

def config_dump(config, filename):
    config = deepcopy(config)
    fill_in_defaults(config)
    message = loom.schema_pb2.Config()
    protobuf_dump(config, message)
    with open_compressed(filename, 'wb') as f:
        f.write(message.SerializeToString())

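# Usage sketch for config_dump() above (exposed as loom.config.config_dump in
# the ingest/generate helpers later in this listing); the file names and the
# partial config keys are illustrative only.
import loom.config

# An empty config is filled in with defaults before serialization.
loom.config.config_dump({}, 'config.pb.gz')

# Partial configs are also completed, e.g. a greedy single-pass schedule:
loom.config.config_dump({'schedule': {'extra_passes': 0.0}}, 'greedy.pb.gz')
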
def test_predict(root, rows_csv, encoding, **unused):
    COUNT = 10
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.query.get_server(root, debug=True) as query_server:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            encoders = json_load(encoding)
            name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
            preql = loom.preql.PreQL(query_server, encoding)
            preql.predict(rows_in, COUNT, result_out, id_offset=False)
            with open_compressed(rows_in, 'rb') as fin:
                with open(result_out, 'r') as fout:
                    in_reader = csv.reader(fin)
                    out_reader = csv.reader(fout)
                    fnames = in_reader.next()
                    out_reader.next()
                    for in_row in in_reader:
                        for i in range(COUNT):
                            out_row = out_reader.next()
                            bundle = zip(fnames, in_row, out_row)
                            for name, in_val, out_val in bundle:
                                encode = name_to_encoder[name]
                                observed = bool(in_val.strip())
                                if observed:
                                    assert_almost_equal(
                                        encode(in_val), encode(out_val))
                                else:
                                    assert_true(bool(out_val.strip()))

def _check_predictions(rows_in, result_out, encoding):
    encoders = json_load(encoding)
    name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
    with open_compressed(rows_in, 'rb') as fin:
        with open(result_out, 'r') as fout:
            in_reader = csv.reader(fin)
            out_reader = csv.reader(fout)
            fnames = in_reader.next()
            out_reader.next()
            for in_row in in_reader:
                for i in range(COUNT):
                    out_row = out_reader.next()
                    bundle = zip(fnames, in_row, out_row)
                    for name, in_val, out_val in bundle:
                        if name == '_id':
                            assert_equal(in_val, out_val)
                            continue
                        encode = name_to_encoder[name]
                        observed = bool(in_val.strip())
                        if observed:
                            assert_almost_equal(
                                encode(in_val), encode(out_val))
                        else:
                            assert_true(bool(out_val.strip()))

def get_example_requests(model):
    cross_cat = CrossCat()
    with open_compressed(model) as f:
        cross_cat.ParseFromString(f.read())
    feature_count = sum(len(kind.featureids) for kind in cross_cat.kinds)
    all_observed = [True] * feature_count
    none_observed = [False] * feature_count
    observeds = []
    observeds.append(all_observed)
    for f in xrange(feature_count):
        observed = all_observed[:]
        observed[f] = False
        observeds.append(observed)
    for f in xrange(feature_count):
        observed = [sample_bernoulli(0.5) for _ in xrange(feature_count)]
        observeds.append(observed)
    for f in xrange(feature_count):
        observed = none_observed[:]
        observed[f] = True
        observeds.append(observed)
    observeds.append(none_observed)
    requests = []
    for i, observed in enumerate(observeds):
        request = Query.Request()
        request.id = "example-{}".format(i)
        request.sample.data.observed[:] = none_observed
        request.sample.to_sample[:] = observed
        request.sample.sample_count = 1
        requests.append(request)
    return requests

def _import_rows_file(args):
    encoding_in, rows_csv_in, rows_out, id_offset, id_stride = args
    assert os.path.isfile(rows_csv_in)
    encoders = json_load(encoding_in)
    message = loom.cFormat.Row()
    add_field = {
        'booleans': message.add_booleans,
        'counts': message.add_counts,
        'reals': message.add_reals,
    }
    with open_compressed(rows_csv_in, 'rb') as f:
        reader = csv.reader(f)
        feature_names = list(reader.next())
        name_to_pos = {name: i for i, name in enumerate(feature_names)}
        schema = []
        for encoder in encoders:
            pos = name_to_pos.get(encoder['name'])
            add = add_field[loom.schema.MODEL_TO_DATATYPE[encoder['model']]]
            encode = load_encoder(encoder)
            schema.append((pos, add, encode))

        def rows():
            for i, row in enumerate(reader):
                message.id = id_offset + id_stride * i
                for pos, add, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    message.add_observed(observed)
                    if observed:
                        add(encode(value))
                yield message
                message.Clear()

        loom.cFormat.row_stream_dump(rows(), rows_out)

def make_fake_encoding(schema_in, model_in, encoding_out):
    '''
    Make a fake encoding from json schema + model.

    Assume that feature names in schema correspond to featureids in model,
    e.g. schema was generated from loom.format.make_schema
    '''
    schema = json_load(schema_in)
    fields = []
    builders = []
    name_to_builder = {}
    for name, model in sorted(schema.iteritems()):
        fields.append(loom.schema.MODEL_TO_DATATYPE[model])
        Builder = FAKE_ENCODER_BUILDERS[model]
        builder = Builder(name, model)
        builders.append(builder)
        name_to_builder[name] = builder
    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    for kind in cross_cat.kinds:
        featureid = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                feature_name = '{:06d}'.format(featureid.next())
                assert feature_name in schema
                if model == 'dd':
                    for i in range(len(shared.alphas)):
                        name_to_builder[feature_name].add_value(str(i))
                elif model == 'dpd':
                    for val in shared.values:
                        name_to_builder[feature_name].add_value(str(val))
    encoders = [b.build() for b in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)

def ingest(name, schema=None, rows_csv=None, id_field=None, debug=False):
    '''
    Ingest dataset with optional json config.

    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files or csv.gz files
        id_field        Column name of id field in input csv
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    if schema is None:
        schema = paths['ingest']['schema']
    if rows_csv is None:
        rows_csv = paths['ingest']['rows_csv']
    if not os.path.exists(schema):
        raise LoomError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    LOG('making schema row')
    loom.format.make_schema_row(
        schema_in=schema,
        schema_row_out=paths['ingest']['schema_row'])
    LOG('making encoding')
    loom.format.make_encoding(
        schema_in=schema,
        rows_in=rows_csv,
        encoding_out=paths['ingest']['encoding'])
    LOG('importing rows')
    loom.format.import_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_csv_in=rows_csv,
        rows_out=paths['ingest']['rows'])
    LOG('importing rowids')
    loom.format.import_rowids(
        rows_csv_in=rows_csv,
        rowids_out=paths['ingest']['rowids'],
        id_field=id_field)
    LOG('making tare rows')
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.config.config_dump({}, paths['query']['config'])

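# Usage sketch for ingest() above (assumed to be exported as
# loom.tasks.ingest); the dataset name and file names are placeholders.
import loom.tasks

loom.tasks.ingest(
    name='my_dataset',        # unique identifier for ingest + inference
    schema='schema.json',     # e.g. {"feature1": "nich"}
    rows_csv='rows.csv.gz',   # csv/csv.gz file, or a directory of them
    id_field='_id')           # column holding row ids
# LOOM_THREADS and LOOM_VERBOSITY in the environment control parallelism and
# logging, as noted in the docstring.
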
def _loom_cross_cat(path, sample):
    """Return the loom CrossCat structure at `path`, whose id is `sample`."""
    model_in = os.path.join(
        path, 'samples', 'sample.%d' % (sample,), 'model.pb.gz')
    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    return cross_cat

def predict(self, rows_csv, count, result_out, id_offset=True):
    with open_compressed(rows_csv, 'rb') as fin:
        with open_compressed(result_out, 'w') as fout:
            reader = csv.reader(fin)
            writer = csv.writer(fout)
            feature_names = list(reader.next())
            writer.writerow(feature_names)
            name_to_pos = {name: i for i, name in enumerate(feature_names)}
            pos_to_decode = {}
            schema = []
            for encoder in self.encoders:
                pos = name_to_pos.get(encoder['name'])
                encode = load_encoder(encoder)
                decode = load_decoder(encoder)
                if pos is not None:
                    pos_to_decode[pos] = decode
                schema.append((pos, encode))
            for row in reader:
                conditioning_row = []
                to_sample = []
                if id_offset:
                    row_id = row.pop(0)
                for pos, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    to_sample.append(not observed)
                    if observed:
                        conditioning_row.append(encode(value))
                    else:
                        conditioning_row.append(None)
                samples = self.query_server.sample(
                    to_sample, conditioning_row, count)
                for sample in samples:
                    if id_offset:
                        out_row = [row_id]
                    else:
                        out_row = []
                    for name in feature_names:
                        pos = name_to_pos[name]
                        decode = pos_to_decode[pos]
                        val = sample[pos]
                        out_row.append(val)
                    writer.writerow(out_row)

def csv_output(arg):
    if arg is None:
        outfile = StringIO()
        yield CsvWriter(outfile, returns=outfile.getvalue)
    elif hasattr(arg, 'write'):
        yield CsvWriter(arg)
    else:
        with open_compressed(arg, 'w') as outfile:
            yield CsvWriter(outfile)

def _get_cross_cat(self, bdb, generator_id, modelno):
    """Return the loom CrossCat structure whose id is `modelno`."""
    model_in = os.path.join(
        self._get_loom_project_path(bdb, generator_id),
        'samples', 'sample.%d' % (modelno,), 'model.pb.gz')
    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    return cross_cat

def group_sample((sample, featureid)):
    model = CrossCat()
    with open_compressed(sample['model']) as f:
        model.ParseFromString(f.read())
    for kindid, kind in enumerate(model.kinds):
        if featureid in kind.featureids:
            break
    assignments = assignment_stream_load(sample['assign'])
    return collate((a.groupids(kindid), a.rowid) for a in assignments)

def generate_init(encoding_in, model_out, seed=0):
    '''
    Generate an initial model for inference.
    '''
    numpy.random.seed(seed)
    encoders = json_load(encoding_in)
    features = import_features(encoders)
    cross_cat = generate_model(features)
    with open_compressed(model_out, 'wb') as f:
        f.write(cross_cat.SerializeToString())

def ingest(name, schema='schema.json', rows_csv='rows.csv.gz', debug=False):
    '''
    Ingest dataset with optional json config.

    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    if not os.path.exists(schema):
        raise IOError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise IOError('Missing rows_csv file: {}'.format(rows_csv))
    paths = loom.store.get_paths(name)
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    LOG('making schema row')
    loom.format.make_schema_row(
        schema_in=schema,
        schema_row_out=paths['ingest']['schema_row'])
    LOG('making encoding')
    loom.format.make_encoding(
        schema_in=schema,
        rows_in=rows_csv,
        encoding_out=paths['ingest']['encoding'])
    LOG('importing rows')
    loom.format.import_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_csv_in=rows_csv,
        rows_out=paths['ingest']['rows'])
    LOG('making tare rows')
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)

def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        field = loom.schema.MODEL_TO_DATATYPE[model]
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())

def test_infer(name, tares, shuffled, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)
        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)
            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0
            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True)
                if kind_structure_is_fixed:
                    assert_equal(len(os.listdir(groups_out)), kind_count)
                group_counts = get_group_counts(groups_out)
                assign_count = sum(
                    1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)
                print 'row_count: {}'.format(row_count)
                print 'group_counts: {}'.format(
                    ' '.join(map(str, group_counts)))
                for group_count in group_counts:
                    assert_true(
                        group_count <= row_count,
                        'groups are all singletons')

def create_dataset(row_count=ROW_COUNT):
    '''
    Extract dataset from image.
    '''
    scipy.misc.imsave(os.path.join(RESULTS, 'original.png'), IMAGE)
    print 'sampling {} points from image'.format(row_count)
    with open_compressed(SAMPLES, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y'])
        for row in sample_from_image(IMAGE, row_count):
            writer.writerow(row)
    with csv_reader(SAMPLES) as reader:
        reader.next()
        image = visualize_dataset(map(float, row) for row in reader)
    scipy.misc.imsave(os.path.join(RESULTS, 'samples.png'), image)

def generate(
        feature_type='mixed',
        row_count=1000,
        feature_count=100,
        density=0.5,
        rows_out='rows.pbs.gz',
        model_out='model.pb.gz',
        groups_out=None,
        assign_out=None,
        init_out=None,
        debug=False,
        profile=None):
    '''
    Generate a synthetic dataset.
    '''
    root = os.getcwd()
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    if groups_out is not None:
        groups_out = os.path.abspath(groups_out)
    if assign_out is not None:
        assign_out = os.path.abspath(assign_out)
    if init_out is not None:
        init_out = os.path.abspath(init_out)
    features = generate_features(feature_count, feature_type)
    model = generate_model(features)
    with tempdir(cleanup_on_error=(not debug)):
        if init_out is None:
            init_out = os.path.abspath('init.pb.gz')
        with open_compressed(init_out, 'wb') as f:
            f.write(model.SerializeToString())
        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)
        os.chdir(root)
        loom.runner.generate(
            config_in=config_in,
            model_in=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            assign_out=assign_out,
            debug=debug,
            profile=profile)

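# Usage sketch for generate() above (assumed to be exported as
# loom.generate.generate, as in generate_one() later in this listing); the
# output paths are placeholders.
import loom.generate

loom.generate.generate(
    feature_type='mixed',      # mix of supported feature models
    row_count=1000,
    feature_count=100,
    density=0.5,               # fraction of observed cells
    rows_out='rows.pbs.gz',    # generated rows as a protobuf stream
    model_out='model.pb.gz',   # generated cross-cat model
    init_out='init.pb.gz',     # prior-only model used to seed generation
    debug=False)
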
def make_schema(model_in, schema_out):
    '''
    Make a schema from a protobuf model.
    '''
    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    schema = {}
    for kind in cross_cat.kinds:
        featureid = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                feature_name = '{:06d}'.format(featureid.next())
                schema[feature_name] = model
    json_dump(schema, schema_out)
    return schema

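# Usage sketch for make_schema() above (exposed as loom.format.make_schema in
# generate_one() later in this listing); file names are placeholders.
import loom.format

schema = loom.format.make_schema(
    model_in='model.pb.gz',    # cross-cat model file
    schema_out='schema.json')  # json mapping zero-padded feature name -> model
# e.g. schema == {'000000': 'nich', '000001': 'dd', ...}
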
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    if not schema:
        raise LoomError('Schema is empty: {}'.format(schema_in))
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        try:
            field = loom.schema.MODEL_TO_DATATYPE[model]
        except KeyError:
            raise LoomError('Unknown model {} in schema {}'.format(
                model, schema_in))
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())

def test_predict_pandas(root, rows_csv, schema, **unused):
    feature_count = len(json_load(schema))
    with loom.preql.get_server(root, debug=True) as preql:
        rows_filename = os.path.join(rows_csv, os.listdir(rows_csv)[0])
        with open_compressed(rows_filename) as f:
            rows_df = pandas.read_csv(
                f,
                converters=preql.converters,
                index_col='_id')
        print 'rows_df ='
        print rows_df
        row_count = rows_df.shape[0]
        assert_equal(rows_df.shape[1], feature_count)
        rows_io = StringIO(rows_df.to_csv())
        result_string = preql.predict(rows_io, COUNT, id_offset=True)
        result_df = pandas.read_csv(StringIO(result_string), index_col=False)
        print 'result_df ='
        print result_df
        assert_equal(result_df.ndim, 2)
        assert_equal(result_df.shape[0], row_count * COUNT)
        assert_equal(result_df.shape[1], 1 + feature_count)

def _make_encoder_builders_file((schema_in, rows_in)):
    assert os.path.isfile(rows_in)
    schema = json_load(schema_in)
    with open_compressed(rows_in, 'rb') as f:
        reader = csv.reader(f)
        header = reader.next()
        builders = []
        for name in header:
            if name in schema:
                model = schema[name]
                Builder = ENCODER_BUILDERS[model]
                builder = Builder(name, model)
            else:
                builder = None
            builders.append(builder)
        for row in reader:
            for value, builder in izip(row, builders):
                if builder is not None:
                    value = value.strip()
                    if value:
                        builder.add_value(value)
    return [b for b in builders if b is not None]

def generate(
        feature_type='mixed',
        row_count=1000,
        feature_count=100,
        density=0.5,
        init_out='init.pb.gz',
        rows_out='rows.pbs.gz',
        model_out='model.pb.gz',
        groups_out='groups',
        debug=False,
        profile=None):
    '''
    Generate a synthetic dataset.
    '''
    root = os.path.abspath(os.path.curdir)
    init_out = os.path.abspath(init_out)
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    groups_out = os.path.abspath(groups_out)
    model = generate_model(row_count, feature_count, feature_type, density)
    with open_compressed(init_out, 'w') as f:
        f.write(model.SerializeToString())
    with tempdir(cleanup_on_error=(not debug)):
        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)
        os.chdir(root)
        loom.runner.generate(
            config_in=config_in,
            model_in=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            debug=debug,
            profile=profile)

def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from protobuf stream to csv.
    '''
    for ext in ['.csv', '.gz', '.bz2']:
        assert not rows_csv_out.endswith(ext), \
            'rows_csv_out should be a dirname'
    assert chunk_size > 0
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    rows = loom.cFormat.row_stream_load(rows_in)
    try:
        empty = None
        for i in xrange(MAX_CHUNK_COUNT):
            file_out = os.path.join(
                rows_csv_out, 'rows_{:06d}.csv.gz'.format(i))
            with open_compressed(file_out, 'wb') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                empty = file_out
                for j in xrange(chunk_size):
                    data = rows.next().iter_data()
                    schema = izip(data['observed'], fields, decoders)
                    row = [
                        decode(data[field].next()) if observed else ''
                        for observed, field, decode in schema
                    ]
                    writer.writerow(row)
                    empty = None
    except StopIteration:
        if empty:
            os.remove(empty)

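# Usage sketch for export_rows() above (exposed as loom.format.export_rows in
# generate_one() later in this listing); paths are placeholders. Note that
# rows_csv_out names a directory: chunks are written inside it as
# rows_000000.csv.gz, rows_000001.csv.gz, ...
import loom.format

loom.format.export_rows(
    encoding_in='encoding.json',  # encoders produced during ingest
    rows_in='rows.pbs.gz',        # protobuf row stream
    rows_csv_out='rows_csv',      # output directory, not a .csv file
    chunk_size=1000000)
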
def relate(self, columns, result_out, sample_count=1000):
    """
    Compute related scores between each column in `columns` and every
    feature column, writing the results to `result_out` as csv.

    The related score is defined as:

        Related(X, Y) = I(X; Y) / H(X, Y)

    where I(X; Y) is the mutual information between X and Y:

        I(X; Y) = E[ log( p(x, y) / ( p(x) p(y) ) ) ],  (x, y) ~ p(x, y)

    and H(X) is the entropy of X:

        H(X) = -E[ log( p(x) ) ],  x ~ p(x)

    so that H(X, Y) is the joint entropy of X and Y. Expectations are
    estimated via Monte Carlo with `sample_count` samples.
    """
    with open_compressed(result_out, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(self.feature_names)
        for target_column in set(columns):
            out_row = [target_column]
            to_sample1 = self.cols_to_sample([target_column])
            for to_relate in self.feature_names:
                to_sample2 = self.cols_to_sample([to_relate])
                mi = self.query_server.mutual_information(
                    to_sample1,
                    to_sample2,
                    sample_count=sample_count).mean
                joined = [to_relate, target_column]
                to_sample_both = self.cols_to_sample(joined)
                joint_entropy = self.query_server.entropy(
                    to_sample_both,
                    sample_count=sample_count).mean
                normalized_mi = self.normalize_mutual_information(
                    mi, joint_entropy)
                out_row.append(normalized_mi)
            writer.writerow(out_row)

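# Toy arithmetic check of the Related score used by relate() above (not loom
# code): Related(X, Y) = I(X; Y) / H(X, Y), with both terms estimated by
# Monte Carlo in the real implementation.
mi = 0.3              # estimated mutual information I(X; Y), in nats
joint_entropy = 1.2   # estimated joint entropy H(X, Y), in nats
related = mi / joint_entropy   # == 0.25
# Independent columns give I(X; Y) = 0, hence Related = 0; a column compared
# with itself gives I(X; X) = H(X) = H(X, X), hence Related = 1.
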
def import_schema():
    schema = [line.strip() for line in open_compressed(VOCAB)]
    with open_compressed(SCHEMA_CSV, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(schema)
    return schema

def csv_writer(filename):
    with open_compressed(filename, 'wb') as f:
        yield csv.writer(f)

def pickle_dump(data, filename):
    with open_compressed(filename, 'wb') as f:
        pickle.dump(data, f)

def pickle_load(filename):
    with open_compressed(filename, 'rb') as f:
        return pickle.load(f)

def _load_checkpoint(step):
    message = loom.schema_pb2.Checkpoint()
    filename = checkpoint_files(step)['checkpoint']
    with open_compressed(filename) as f:
        message.ParseFromString(f.read())
    return message

def csv_input(arg):
    if hasattr(arg, 'read'):
        yield csv.reader(arg)
    else:
        with open_compressed(arg, 'rb') as infile:
            yield csv.reader(infile)

def get_example_requests(model, rows, query_type='mixed'):
    assert query_type in ['sample', 'score', 'mixed']
    cross_cat = CrossCat()
    with open_compressed(model, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    feature_count = sum(len(kind.featureids) for kind in cross_cat.kinds)
    featureids = range(feature_count)

    nontrivials = [True] * feature_count
    for kind in cross_cat.kinds:
        fs = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                f = fs.next()
                if model == 'dd':
                    if len(shared.alphas) == 0:
                        nontrivials[f] = False
                elif model == 'dpd':
                    if len(shared.betas) == 0:
                        nontrivials[f] = False

    all_observed = nontrivials[:]
    none_observed = [False] * feature_count
    observeds = []
    observeds.append(all_observed)
    for f, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            observed = all_observed[:]
            observed[f] = False
            observeds.append(observed)
    for f in featureids:
        observed = [
            nontrivial and sample_bernoulli(0.5)
            for nontrivial in nontrivials
        ]
        observeds.append(observed)
    for f, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            observed = none_observed[:]
            observed[f] = True
            observeds.append(observed)
    observeds.append(none_observed)

    requests = []
    for i, observed in enumerate(observeds):
        request = Query.Request()
        request.id = "example-{}".format(i)
        if query_type in ['sample', 'mixed']:
            set_diff(request.sample.data, none_observed)
            request.sample.to_sample.sparsity = DENSE
            request.sample.to_sample.dense[:] = observed
            request.sample.sample_count = 1
        if query_type in ['score', 'mixed']:
            set_diff(request.score.data, none_observed)
        requests.append(request)

    for row in load_rows(rows)[:20]:
        i += 1
        request = Query.Request()
        request.id = "example-{}".format(i)
        if query_type in ['sample', 'mixed']:
            request.sample.sample_count = 1
            request.sample.data.MergeFrom(row.diff)
            request.sample.to_sample.sparsity = DENSE
            conditions = izip(nontrivials, row.diff.pos.observed.dense)
            to_sample = [
                nontrivial and not is_observed
                for nontrivial, is_observed in conditions
            ]
            set_observed(request.sample.to_sample, to_sample)
        if query_type in ['score', 'mixed']:
            request.score.data.MergeFrom(row.diff)
        requests.append(request)
    return requests

def csv_reader(filename):
    with open_compressed(filename, 'rb') as f:
        yield csv.reader(f)

def csv_load(filename):
    with open_compressed(filename) as f:
        reader = csv.reader(f)
        return list(reader)

def generate_one((name, sample_count, force, debug)):
    paths = loom.store.get_paths(name, sample_count=sample_count)
    if not force and all(os.path.exists(f) for f in paths.itervalues()):
        with open_compressed(paths['ingest']['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return
    print 'generating', name
    mkdir_p(paths['root'])
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    config = CONFIGS[name]
    chunk_size = max(10, (config['row_count'] + 7) / 8)
    loom.transforms.make_fake_transforms(
        transforms_out=paths['ingest']['transforms'])
    loom.generate.generate(
        init_out=paths['samples'][0]['init'],
        rows_out=paths['ingest']['rows'],
        model_out=paths['samples'][0]['model'],
        groups_out=paths['samples'][0]['groups'],
        assign_out=paths['samples'][0]['assign'],
        **config)
    loom.format.make_schema(
        model_in=paths['samples'][0]['model'],
        schema_out=paths['ingest']['schema'])
    loom.format.make_fake_encoding(
        schema_in=paths['ingest']['schema'],
        model_in=paths['samples'][0]['model'],
        encoding_out=paths['ingest']['encoding'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_in=paths['ingest']['rows'],
        rows_csv_out=paths['ingest']['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=paths['ingest']['rows_csv'],
        rowids_out=paths['ingest']['rowids'],
        id_field='_id')
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])
    for seed, sample in enumerate(paths['samples']):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=paths['ingest']['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=paths['ingest']['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])
    sample0 = paths['samples'][0]
    for seed, sample in enumerate(paths['samples'][1:]):
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=paths['ingest']['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(
        paths=paths,
        debug=debug)

def csv_dump(data, filename):
    with open_compressed(filename, 'w') as f:
        writer = csv.writer(f)
        for row in data:
            writer.writerow(row)
