def make_fake_encoding(schema_in, model_in, encoding_out):
    '''
    Build a fake encoding from a json schema and a CrossCat model file.

    Feature names in the schema are assumed to be the model's featureids
    formatted as '{:06d}', e.g. a schema written by loom.format.make_schema.
    The resulting encoders are dumped as json to encoding_out.
    '''
    schema = json_load(schema_in)
    datatypes = []
    ordered_builders = []
    builder_by_name = {}
    for feature, model_name in sorted(schema.iteritems()):
        datatypes.append(loom.schema.MODEL_TO_DATATYPE[model_name])
        make_builder = FAKE_ENCODER_BUILDERS[model_name]
        feature_builder = make_builder(feature, model_name)
        ordered_builders.append(feature_builder)
        builder_by_name[feature] = feature_builder

    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as model_file:
        cross_cat.ParseFromString(model_file.read())

    for kind in cross_cat.kinds:
        # One featureid is consumed per shared model, in MODELS order.
        featureids = iter(kind.featureids)
        for model_name in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model_name):
                feature = '{:06d}'.format(next(featureids))
                assert feature in schema
                if model_name == 'dd':
                    # dd values are the indices of the alphas
                    raw_values = range(len(shared.alphas))
                elif model_name == 'dpd':
                    raw_values = shared.values
                else:
                    # other models contribute no values
                    continue
                for raw in raw_values:
                    builder_by_name[feature].add_value(str(raw))

    encoders = [item.build() for item in ordered_builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
def load(name, schema, rows_csv):
    '''
    Load a csv dataset for testing and benchmarking.

    The schema is copied into the dataset's ingest paths, a schema row is
    derived from it, and the rows are symlinked (not copied) into place.

    Inputs:
        name - dataset name, resolved to paths by loom.store.get_paths
        schema - path to an existing '.json' schema file
        rows_csv - path to an existing '.csv' or '.csv.gz' file, or to an
            existing directory of rows; may be relative to the current
            working directory

    Raises:
        AssertionError if an input is missing or misnamed, or if the
        dataset's root path already exists.
    '''
    assert os.path.exists(schema)
    assert schema.endswith('.json')
    assert os.path.exists(rows_csv)
    if os.path.isfile(rows_csv):
        assert rows_csv.endswith(('.csv', '.csv.gz'))
    else:
        assert os.path.isdir(rows_csv)
    paths = loom.store.get_paths(name)
    assert not os.path.exists(paths['root']), 'dataset already loaded'
    json_dump(json_load(schema), paths['ingest']['schema'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])

    # A relative symlink target is resolved against the directory that
    # contains the link, not against the current working directory, so a
    # relative rows_csv would produce a dangling link. Link to the absolute
    # path instead.
    rows_target = os.path.abspath(rows_csv)
    rows_link = paths['ingest']['rows_csv']
    if os.path.isdir(rows_target):
        os.symlink(rows_target, rows_link)
    else:
        # A single file is linked inside a fresh rows directory.
        os.makedirs(rows_link)
        os.symlink(
            rows_target,
            os.path.join(rows_link, os.path.basename(rows_target)))
def _test_modify_schema(modify, name, schema, rows_csv, **unused):
    '''
    Ingest a dataset using a modified copy of its schema.

    The schema is loaded, passed through `modify`, and written to
    'schema.json' inside a temporary store; loom.store.STORE is patched to
    that store while loom.tasks.ingest runs in debug mode.
    '''
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            schema_path = os.path.join(store, 'schema.json')
            json_dump(modify(json_load(schema)), schema_path)
            loom.tasks.ingest(name, schema_path, rows_csv, debug=True)
def make_encoding(schema_in, rows_in, encoding_out):
    '''
    Make a row encoder from csv rows data + json schema.

    rows_in may be a directory or a single file. The built encoders are
    ordered by get_encoder_rank and dumped as json to encoding_out.
    '''
    if os.path.isdir(rows_in):
        builders = _make_encoder_builders_dir(schema_in, rows_in)
    else:
        # the single-file helper takes one (schema_in, rows_in) tuple
        builders = _make_encoder_builders_file((schema_in, rows_in))
    encoders = sorted(
        (builder.build() for builder in builders),
        key=get_encoder_rank)
    json_dump(encoders, encoding_out)
def make_transforms(schema_in, rows_in, schema_out, transforms_out):
    '''
    Derive transforms and a basic schema from a fluent schema.

    The fluent schema is read with load_schema. Each fluent type is either
    mapped to a transform (or a transform builder that is later built from
    rows_in), or translated directly through FLUENT_TO_BASIC. The basic
    schema is dumped as json to schema_out and the transforms are pickled
    to transforms_out.

    Returns:
        the feature name of the 'id' field, or None if there is none
    '''
    fluent_schema = load_schema(schema_in)
    date_names = [
        name
        for name, kind in fluent_schema.iteritems()
        if kind.endswith('date')
    ]
    basic_schema = {}
    pre_transforms = []
    transforms = []
    builders = []
    id_field = None
    optional_prefix = 'optional_'

    for name, kind in fluent_schema.iteritems():
        # parse adjectives
        if kind.startswith(optional_prefix):
            presence = PresenceTransform(name)
            pre_transforms.append(presence)
            transforms.append(presence)
            kind = kind[len(optional_prefix):]
            name = '{}.value'.format(name)

        # parse nouns
        if kind == 'id':
            id_field = name
        elif kind in ['categorical', 'unbounded_categorical']:
            transforms.append(StringTransform(name, kind))
        elif kind == 'percent':
            transforms.append(PercentTransform(name))
        elif kind == 'sparse_real':
            transforms.append(SparseRealTransform(name))
        elif kind == 'text':
            builders.append(TextTransformBuilder(name))
        elif kind == 'tags':
            builders.append(TextTransformBuilder(name, allow_empty=True))
        elif kind == 'date':
            # NOTE(review): date_names holds the original schema names,
            # while an optional date is compared here under its
            # '<name>.value' name, so its own original name sorts before it
            # and ends up in `relatives` - confirm this is intended.
            relatives = [other for other in date_names if other < name]
            transforms.append(DateTransform(name, relatives))
        else:
            basic_schema[name] = FLUENT_TO_BASIC[kind]

    if builders:
        transforms.extend(build_transforms(rows_in, pre_transforms, builders))
    for transform in transforms:
        basic_schema.update(transform.get_schema())

    json_dump(basic_schema, schema_out)
    pickle_dump(transforms, transforms_out)
    message = 'transformed {} -> {} features'.format(
        len(fluent_schema),
        len(basic_schema))
    LOG(message)
    return id_field
def crossvalidate_one(seed, test_count, train_count, inputs, results,
                      extra_passes, debug):
    '''
    Run a single crossvalidation round and return its mean score.

    The ingested diffs and rows are split with one seeded random mask:
    masked-in diffs become the training set and masked-out rows become the
    test set. A model is then shuffled, initialized and inferred on the
    training set, and each test row is scored through a query server.

    The 'train', 'test' and 'scores' paths are added to `results`, and the
    per-row scores are dumped as json to results['scores'].
    '''
    LOG('running seed {}:'.format(seed))
    root = results['root']
    results['train'] = os.path.join(root, 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(root, 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(root, 'scores.json.gz')

    config = {'seed': seed, 'schedule': {'extra_passes': extra_passes}}
    sample = results['samples'][0]
    loom.config.config_dump(config, sample['config'])

    # True marks a training row, False a held-out test row.
    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)

    ingest = inputs['ingest']
    diffs_in = protobuf_stream_load(ingest['diffs'])
    train_rows = (row for keep, row in izip(split, diffs_in) if keep)
    protobuf_stream_dump(train_rows, results['train'])
    rows_in = protobuf_stream_load(ingest['rows'])
    test_rows = (row for keep, row in izip(split, rows_in) if not keep)
    protobuf_stream_dump(test_rows, results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(
        rows_in=results['train'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)

    LOG(' init')
    loom.generate.generate_init(
        encoding_in=ingest['encoding'],
        model_out=sample['init'],
        seed=seed)

    LOG(' infer')
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=ingest['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        debug=debug)

    LOG(' query')
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]

    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)
def make_schema(model_in, schema_out):
    '''
    Make a schema from a protobuf model.

    Each featureid of each kind is formatted as '{:06d}' and mapped to the
    name of the model it belongs to. The schema is dumped as json to
    schema_out and also returned.
    '''
    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as model_file:
        cross_cat.ParseFromString(model_file.read())

    schema = {}
    for kind in cross_cat.kinds:
        # One featureid is consumed per shared model, in MODELS order.
        featureids = iter(kind.featureids)
        for model_name in loom.schema.MODELS.iterkeys():
            shareds = getattr(kind.product_model, model_name)
            for _ in shareds:
                schema['{:06d}'.format(next(featureids))] = model_name

    json_dump(schema, schema_out)
    return schema
def crossvalidate(
        name=None,
        sample_count=10,
        portion=0.9,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False):
    '''
    Randomly split dataset; train models; score held-out data.

    Inputs:
        name - dataset name, resolved to paths by loom.store.get_paths
        sample_count - number of rounds; round i uses seed i
        portion - fraction of rows used for training, strictly in (0, 1)
        extra_passes - passed through to each round's schedule config
        debug - passed through to the loom runner and query server

    The mean score of each round is dumped as json to 'scores.json.gz'
    under the dataset's 'crossvalidate' root, and the mean and standard
    deviation over rounds are printed.

    Raises:
        AssertionError if portion or sample_count is out of range, or if
        the dataset has fewer than two rows.
    '''
    assert 0 < portion and portion < 1, portion
    assert sample_count > 0, sample_count
    # crossvalidate_one reads inputs['ingest']['rows'] to build the test
    # set, so require it up front together with the other ingest files.
    # NOTE(review): assumes loom.store.require accepts 'ingest.rows' the
    # same way it accepts the sibling 'ingest.*' keys - confirm.
    loom.store.require(name, [
        'ingest.encoding',
        'ingest.tares',
        'ingest.diffs',
        'ingest.rows',
    ])
    inputs = loom.store.get_paths(name)

    row_count = sum(1 for _ in protobuf_stream_load(inputs['ingest']['diffs']))
    assert row_count > 1, 'too few rows to crossvalidate: {}'.format(row_count)

    # Clamp so that both the train and the test set get at least one row.
    train_count = max(1, min(row_count - 1, int(round(portion * row_count))))
    test_count = row_count - train_count
    assert 1 <= train_count and 1 <= test_count

    mean_scores = []
    for seed in xrange(sample_count):
        results = loom.store.get_paths(
            os.path.join(name, 'crossvalidate/{}'.format(seed)))
        mean = crossvalidate_one(
            seed,
            test_count,
            train_count,
            inputs,
            results,
            extra_passes,
            debug)
        mean_scores.append(mean)

    results = loom.store.get_paths(os.path.join(name, 'crossvalidate'))
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    json_dump(mean_scores, results['scores'])
    # Single parenthesized argument: prints the same text under the
    # Python 2 print statement and the print function.
    print('score = {} +- {}'.format(
        numpy.mean(mean_scores),
        numpy.std(mean_scores)))
def make_fake_encoding(schema_in, rows_in, encoding_out):
    '''
    Make a fake encoding from json schema + protobuf rows.

    One encoder builder is created per schema feature, in sorted feature
    order. Every observed value in the rows stream is stringified and added
    to the builder of its feature. The built encoders are dumped as json to
    encoding_out.
    '''
    schema = json_load(schema_in)
    datatypes = []
    builders = []
    for feature, model_name in sorted(schema.iteritems()):
        datatypes.append(loom.schema.MODEL_TO_DATATYPE[model_name])
        make_builder = FAKE_ENCODER_BUILDERS[model_name]
        builders.append(make_builder(feature, model_name))

    for row in loom.cFormat.row_stream_load(rows_in):
        data = row.iter_data()
        columns = izip(data['observed'], datatypes, builders)
        for is_observed, datatype, feature_builder in columns:
            if not is_observed:
                continue
            # each datatype has its own value iterator; take the next one
            feature_builder.add_value(str(data[datatype].next()))

    encoders = [item.build() for item in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
def crossvalidate(
        name=None,
        sample_count=10,
        portion=0.9,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False):
    '''
    Randomly split dataset; train models; score held-out data.

    Inputs:
        name - dataset name, resolved to paths by loom.store.get_paths
        sample_count - number of rounds; round i uses seed i
        portion - fraction of rows used for training, strictly in (0, 1)
        extra_passes - passed through to each round's schedule config
        debug - passed through to the loom runner and query server

    The mean score of each round is dumped as json to 'scores.json.gz'
    under the dataset's 'crossvalidate' root, and the mean and standard
    deviation over rounds are printed.

    Raises:
        AssertionError if portion or sample_count is out of range, or if
        the dataset has fewer than two rows.
    '''
    assert 0 < portion and portion < 1, portion
    assert sample_count > 0, sample_count
    # The per-seed rounds read inputs['ingest']['rows'] for the held-out
    # set, so check for it here rather than failing inside the first round.
    # NOTE(review): assumes loom.store.require accepts 'ingest.rows' the
    # same way it accepts the sibling 'ingest.*' keys - confirm.
    required = [
        'ingest.encoding',
        'ingest.tares',
        'ingest.diffs',
        'ingest.rows',
    ]
    loom.store.require(name, required)
    inputs = loom.store.get_paths(name)

    row_count = sum(1 for _ in protobuf_stream_load(inputs['ingest']['diffs']))
    assert row_count > 1, 'too few rows to crossvalidate: {}'.format(row_count)

    # Clamp so that both the train and the test set get at least one row.
    train_count = max(1, min(row_count - 1, int(round(portion * row_count))))
    test_count = row_count - train_count
    assert 1 <= train_count and 1 <= test_count

    mean_scores = []
    for seed in xrange(sample_count):
        results = loom.store.get_paths(
            os.path.join(name, 'crossvalidate/{}'.format(seed)))
        mean = crossvalidate_one(seed, test_count, train_count, inputs,
                                 results, extra_passes, debug)
        mean_scores.append(mean)

    results = loom.store.get_paths(os.path.join(name, 'crossvalidate'))
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    json_dump(mean_scores, results['scores'])
    # Single parenthesized argument: prints the same text under the
    # Python 2 print statement and the print function.
    print('score = {} +- {}'.format(numpy.mean(mean_scores),
                                    numpy.std(mean_scores)))
def find_consensus_grouping(groupings, debug=False):
    '''
    This implements Strehl et al's Meta-Clustering Algorithm [1].

    Inputs:
        groupings - a list of lists of lists of object ids, for example

            [
                [                   # sample 0
                    [0, 1, 2],      # sample 0, group 0
                    [3, 4],         # sample 0, group 1
                    [5]             # sample 0, group 2
                ],
                [                   # sample 1
                    [0, 1],         # sample 1, group 0
                    [2, 3, 4, 5]    # sample 1, group 1
                ]
            ]

        debug - if true, the metis arguments are dumped as json to
            METIS_ARGS_TEMPFILE before partitioning and the file is removed
            once partitioning returns

    Returns:
        a list of Row instances sorted by
        (row.group_id, - row.confidence, row.row_id);
        group ids are assigned in order of decreasing group length

    Raises:
        LoomError if groupings is empty, if the similarity maximum is not
        <= 1, if metis returns a partition of the wrong length, or if any
        confidence is not finite

    References:
    [1] Alexander Strehl, Joydeep Ghosh, Claire Cardie (2002)
        "Cluster Ensembles - A Knowledge Reuse Framework
        for Combining Multiple Partitions"
        Journal of Machine Learning Research
        http://jmlr.csail.mit.edu/papers/volume3/strehl02a/strehl02a.pdf
    '''
    if not groupings:
        raise LoomError('tried to find consensus among zero groupings')

    # ------------------------------------------------------------------------
    # Set up consensus grouping problem

    # Flatten samples into one list of groups; every group is a vertex.
    allgroups = sum(groupings, [])
    objects = list(set(sum(allgroups, [])))
    objects.sort()
    # Position of each object id within the sorted list of objects.
    index = {item: i for i, item in enumerate(objects)}

    vertices = [
        numpy.array(map(index.__getitem__, g), dtype=numpy.intp)
        for g in allgroups
    ]

    # contains[v, u] is 1 when group v contains object u, else 0.
    contains = numpy.zeros((len(vertices), len(objects)), dtype=numpy.float32)
    for v, vertex in enumerate(vertices):
        contains[v, vertex] = 1  # i.e. for u in vertex: contains[v, u] = 1

    # We use the binary Jaccard measure for similarity
    overlap = numpy.dot(contains, contains.T)
    diag = overlap.diagonal()
    denom = (diag.reshape(len(vertices), 1) +
             diag.reshape(1, len(vertices)) - overlap)
    similarity = overlap / denom

    # ------------------------------------------------------------------------
    # Format for metis

    # Written as `not (... <= 1)` so that a NaN maximum also fails the check.
    if not (similarity.max() <= 1):
        raise LoomError('similarity.max() = {}'.format(similarity.max()))
    similarity *= 2**16  # metis segfaults if this is too large
    int_similarity = numpy.zeros(similarity.shape, dtype=numpy.int32)
    int_similarity[:] = numpy.rint(similarity)

    # Edges are the nonzero entries of the rounded similarity matrix; the
    # weights are read out in the same order the edges are visited below.
    edges = int_similarity.nonzero()
    edge_weights = map(int, int_similarity[edges])
    edges = numpy.transpose(edges)

    adjacency = [[] for _ in vertices]
    for i, j in edges:
        adjacency[i].append(j)

    # FIXME is there a better way to choose the final group count?
    group_count = int(numpy.median(map(len, groupings)))

    metis_args = {
        'nparts': group_count,
        'adjacency': adjacency,
        'eweights': edge_weights,
    }

    if debug:
        json_dump(metis_args, METIS_ARGS_TEMPFILE, indent=4)

    edge_cut, partition = pymetis.part_graph(**metis_args)

    if debug:
        os.remove(METIS_ARGS_TEMPFILE)

    # ------------------------------------------------------------------------
    # Clean up solution

    parts = range(group_count)
    if len(partition) != len(vertices):
        raise LoomError('metis output vector has wrong length')

    # represents[p, v] is 1 when metis assigned vertex v to part p.
    represents = numpy.zeros((len(parts), len(vertices)))
    for v, p in enumerate(partition):
        represents[p, v] = 1

    # From here on contains[p, u] is the fraction of the vertices in part p
    # that contain object u.
    contains = numpy.dot(represents, contains)
    represent_counts = represents.sum(axis=1)
    represent_counts[numpy.where(represent_counts == 0)] = 1  # avoid NANs
    contains /= represent_counts.reshape(group_count, 1)

    # Each object goes to the part with the largest fraction for it; that
    # fraction is the object's confidence.
    bestmatch = contains.argmax(axis=0)
    confidence = contains[bestmatch, range(len(bestmatch))]
    if not all(numpy.isfinite(confidence)):
        raise LoomError('confidence is nan')

    # Renumber the parts that were actually chosen to 0, 1, 2, ...
    nonempty_groups = list(set(bestmatch))
    nonempty_groups.sort()
    reindex = {j: i for i, j in enumerate(nonempty_groups)}

    grouping = [
        Row(row_id=objects[i], group_id=reindex[g], confidence=c)
        for i, (g, c) in enumerate(izip(bestmatch, confidence))
    ]

    # Reassign group ids so that longer groups get smaller ids.
    groups = collate((row.group_id, row) for row in grouping)
    groups.sort(key=len, reverse=True)
    grouping = [
        Row(row_id=row.row_id, group_id=group_id, confidence=row.confidence)
        for group_id, group in enumerate(groups)
        for row in group
    ]
    grouping.sort(key=lambda x: (x.group_id, -x.confidence, x.row_id))

    return grouping
def find_consensus_grouping(groupings, debug=False):
    '''
    This implements Strehl et al's Meta-Clustering Algorithm [1].

    Inputs:
        groupings - a list of lists of lists of object ids, for example

            [
                [                   # sample 0
                    [0, 1, 2],      # sample 0, group 0
                    [3, 4],         # sample 0, group 1
                    [5]             # sample 0, group 2
                ],
                [                   # sample 1
                    [0, 1],         # sample 1, group 0
                    [2, 3, 4, 5]    # sample 1, group 1
                ]
            ]

        debug - if true, the metis arguments are dumped as json to
            METIS_ARGS_TEMPFILE before partitioning and the file is removed
            once partitioning returns

    Returns:
        a list of Row instances sorted by
        (row.group_id, - row.confidence, row.row_id);
        group ids are assigned in order of decreasing group length

    Raises:
        LoomError if groupings is empty, if the similarity maximum is not
        <= 1, if metis returns a partition of the wrong length, or if any
        confidence is not finite

    References:
    [1] Alexander Strehl, Joydeep Ghosh, Claire Cardie (2002)
        "Cluster Ensembles - A Knowledge Reuse Framework
        for Combining Multiple Partitions"
        Journal of Machine Learning Research
        http://jmlr.csail.mit.edu/papers/volume3/strehl02a/strehl02a.pdf
    '''
    if not groupings:
        raise LoomError('tried to find consensus among zero groupings')

    # ------------------------------------------------------------------------
    # Set up consensus grouping problem

    # Every group of every sample becomes one vertex of the meta-graph.
    allgroups = sum(groupings, [])
    objects = list(set(sum(allgroups, [])))
    objects.sort()
    # Maps an object id to its position in the sorted objects list.
    index = {item: i for i, item in enumerate(objects)}

    vertices = [numpy.array(map(index.__getitem__, g), dtype=numpy.intp)
                for g in allgroups]

    # Membership matrix: one row per vertex, one column per object.
    contains = numpy.zeros((len(vertices), len(objects)), dtype=numpy.float32)
    for v, vertex in enumerate(vertices):
        contains[v, vertex] = 1  # i.e. for u in vertex: contains[v, u] = 1

    # We use the binary Jaccard measure for similarity
    overlap = numpy.dot(contains, contains.T)
    diag = overlap.diagonal()
    denom = (diag.reshape(len(vertices), 1) + diag.reshape(1, len(vertices)) -
             overlap)
    similarity = overlap / denom

    # ------------------------------------------------------------------------
    # Format for metis

    # The negated comparison also rejects a NaN maximum.
    if not (similarity.max() <= 1):
        raise LoomError('similarity.max() = {}'.format(similarity.max()))
    similarity *= 2**16  # metis segfaults if this is too large
    int_similarity = numpy.zeros(similarity.shape, dtype=numpy.int32)
    int_similarity[:] = numpy.rint(similarity)

    # Only pairs whose rounded similarity is nonzero become edges; weights
    # are taken in the same order as the edges appended to adjacency.
    edges = int_similarity.nonzero()
    edge_weights = map(int, int_similarity[edges])
    edges = numpy.transpose(edges)

    adjacency = [[] for _ in vertices]
    for i, j in edges:
        adjacency[i].append(j)

    # FIXME is there a better way to choose the final group count?
    group_count = int(numpy.median(map(len, groupings)))

    metis_args = {
        'nparts': group_count,
        'adjacency': adjacency,
        'eweights': edge_weights,
    }

    if debug:
        json_dump(metis_args, METIS_ARGS_TEMPFILE, indent=4)

    edge_cut, partition = pymetis.part_graph(**metis_args)

    if debug:
        os.remove(METIS_ARGS_TEMPFILE)

    # ------------------------------------------------------------------------
    # Clean up solution

    parts = range(group_count)
    if len(partition) != len(vertices):
        raise LoomError('metis output vector has wrong length')

    # Indicator matrix: one row per part, one column per vertex.
    represents = numpy.zeros((len(parts), len(vertices)))
    for v, p in enumerate(partition):
        represents[p, v] = 1

    # contains is rebound to a part-by-object matrix holding, per part, the
    # fraction of its vertices that contain each object.
    contains = numpy.dot(represents, contains)
    represent_counts = represents.sum(axis=1)
    represent_counts[numpy.where(represent_counts == 0)] = 1  # avoid NANs
    contains /= represent_counts.reshape(group_count, 1)

    # Pick, per object, the part with the largest fraction; that fraction
    # is reported as the confidence.
    bestmatch = contains.argmax(axis=0)
    confidence = contains[bestmatch, range(len(bestmatch))]
    if not all(numpy.isfinite(confidence)):
        raise LoomError('confidence is nan')

    # Compact the ids of the parts that were chosen at least once.
    nonempty_groups = list(set(bestmatch))
    nonempty_groups.sort()
    reindex = {j: i for i, j in enumerate(nonempty_groups)}

    grouping = [Row(row_id=objects[i], group_id=reindex[g], confidence=c)
                for i, (g, c) in enumerate(izip(bestmatch, confidence))]

    # Renumber groups from longest to shortest.
    groups = collate((row.group_id, row) for row in grouping)
    groups.sort(key=len, reverse=True)
    grouping = [
        Row(row_id=row.row_id, group_id=group_id, confidence=row.confidence)
        for group_id, group in enumerate(groups) for row in group
    ]
    grouping.sort(key=lambda x: (x.group_id, -x.confidence, x.row_id))

    return grouping
def crossvalidate_one(
        seed,
        test_count,
        train_count,
        inputs,
        results,
        extra_passes,
        debug):
    '''
    Train on a random subset of rows and score the held-out remainder.

    A boolean mask with train_count True and test_count False entries is
    shuffled under the given seed. Diffs selected by the mask are written
    as the training set and rows rejected by it as the test set. The
    training set is shuffled, a model is initialized and inferred, and each
    test row is scored via a query server.

    Side effects: adds the 'train', 'test' and 'scores' paths to `results`
    and dumps the per-row scores as json to results['scores'].

    Returns:
        the mean of the per-row scores
    '''
    LOG('running seed {}:'.format(seed))
    for key, parts in [
            ('train', ('train', 'diffs.pbs.gz')),
            ('test', ('test', 'rows.pbs.gz')),
            ('scores', ('scores.json.gz',))]:
        results[key] = os.path.join(results['root'], *parts)

    schedule = {'extra_passes': extra_passes}
    config = {'seed': seed, 'schedule': schedule}
    sample_paths = results['samples'][0]
    loom.config.config_dump(config, sample_paths['config'])

    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)

    ingest_paths = inputs['ingest']
    diffs_in = protobuf_stream_load(ingest_paths['diffs'])
    protobuf_stream_dump(
        (row for in_train, row in izip(split, diffs_in) if in_train),
        results['train'])
    rows_in = protobuf_stream_load(ingest_paths['rows'])
    protobuf_stream_dump(
        (row for in_train, row in izip(split, rows_in) if not in_train),
        results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(
        rows_in=results['train'],
        rows_out=sample_paths['shuffled'],
        seed=seed,
        debug=debug)

    LOG(' init')
    loom.generate.generate_init(
        encoding_in=ingest_paths['encoding'],
        model_out=sample_paths['init'],
        seed=seed)

    LOG(' infer')
    loom.runner.infer(
        config_in=sample_paths['config'],
        rows_in=sample_paths['shuffled'],
        tares_in=ingest_paths['tares'],
        model_in=sample_paths['init'],
        model_out=sample_paths['model'],
        groups_out=sample_paths['groups'],
        debug=debug)

    LOG(' query')
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]

    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)