def _make_encoder_builders_file(args):
    '''
    Build encoder builders for the schema features found in one csv file.

    The two inputs arrive packed in a single positional tuple, exactly as
    with the previous tuple-parameter signature, so existing callers that
    pass one (schema_in, rows_in) pair keep working.

    Arguments:
        args    A pair (schema_in, rows_in):
                schema_in   Json schema file mapping feature name to model
                rows_in     Csv file whose first row is a header

    Returns:
        A list with one builder per header column that names a schema
        feature, in header order. Each builder has been fed every nonempty,
        whitespace-stripped value of its column.

    Raises:
        LoomError if the csv file is empty, repeats a schema column,
        contains no schema column, or lacks some schema column.
    '''
    schema_in, rows_in = args
    assert os.path.isfile(rows_in)
    schema = json_load(schema_in)
    with csv_reader(rows_in) as reader:
        # An empty file used to leak a bare StopIteration from reader.next().
        header = next(reader, None)
        if header is None:
            raise LoomError('Csv file is empty: {}'.format(rows_in))

        # One slot per csv column; None marks columns absent from the schema
        # so that values can later be zipped positionally against builders.
        builders = []
        seen = set()
        for name in header:
            if name in schema:
                if name in seen:
                    raise LoomError(
                        'Repeated column {} in csv file {}'.format(
                            name, rows_in))
                seen.add(name)
                model = schema[name]
                builder = ENCODER_BUILDERS[model](name, model)
            else:
                builder = None
            builders.append(builder)

        if all(builder is None for builder in builders):
            raise LoomError(
                'Csv file has no known features; '
                'try adding a header to {}'.format(rows_in))
        missing_features = sorted(set(schema) - seen)
        if missing_features:
            raise LoomError('\n '.join(
                ['Csv file is missing features:'] + missing_features))

        for row in reader:
            for value, builder in izip(row, builders):
                if builder is not None:
                    value = value.strip()
                    # Blank cells are treated as unobserved and skipped.
                    if value:
                        builder.add_value(value)

    return [b for b in builders if b is not None]
def ingest(name, schema=None, rows_csv=None, id_field=None, debug=False):
    '''
    Ingest a dataset into loom's internal files.

    Runs the ingest steps in order: write the loom version, make the schema
    row, make the encoding, import rows and rowids, compute tare rows,
    sparsify the rows against the tares, and finally dump an empty query
    config.

    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"};
                        defaults to the schema path stored for name
        rows_csv        File or directory of csv files or csv.gz files;
                        defaults to the rows_csv path stored for name
        id_field        Column name of id field in input csv
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level

    Raises:
        LoomError if the schema or rows_csv path does not exist.
    '''
    paths = loom.store.get_paths(name)
    ingest_paths = paths['ingest']

    # Fall back to the stored locations when no explicit input is given.
    schema = ingest_paths['schema'] if schema is None else schema
    rows_csv = ingest_paths['rows_csv'] if rows_csv is None else rows_csv

    if not os.path.exists(schema):
        raise LoomError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    with open_compressed(ingest_paths['version'], 'w') as version_file:
        version_file.write(loom.__version__)

    LOG('making schema row')
    loom.format.make_schema_row(
        schema_in=schema,
        schema_row_out=ingest_paths['schema_row'])

    LOG('making encoding')
    loom.format.make_encoding(
        schema_in=schema,
        rows_in=rows_csv,
        encoding_out=ingest_paths['encoding'])

    LOG('importing rows')
    loom.format.import_rows(
        encoding_in=ingest_paths['encoding'],
        rows_csv_in=rows_csv,
        rows_out=ingest_paths['rows'])

    LOG('importing rowids')
    loom.format.import_rowids(
        rows_csv_in=rows_csv,
        rowids_out=ingest_paths['rowids'],
        id_field=id_field)

    LOG('making tare rows')
    loom.runner.tare(
        schema_row_in=ingest_paths['schema_row'],
        rows_in=ingest_paths['rows'],
        tares_out=ingest_paths['tares'],
        debug=debug)

    # Count the tare rows by streaming the file once; used only for logging.
    tare_count = 0
    for _ in protobuf_stream_load(ingest_paths['tares']):
        tare_count += 1
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=ingest_paths['schema_row'],
        tares_in=ingest_paths['tares'],
        rows_in=ingest_paths['rows'],
        rows_out=ingest_paths['diffs'],
        debug=debug)

    loom.config.config_dump({}, paths['query']['config'])
def make_consensus(name, config=None, debug=False):
    '''
    Combine samples into a single consensus sample.

    Arguments:
        name            A unique identifier for consensus
        config          An optional json config file, or an in-memory config
                        object (which is deep-copied). It currently doesn't
                        do anything but will be used to support e.g. cluster
                        coarseness in the future
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_VERBOSITY  Verbosity level

    Raises:
        LoomError if config is a filename that does not exist.
    '''
    paths = loom.store.get_paths(name)

    LOG('making config')
    if isinstance(config, basestring):
        # A string is treated as the path of a json config file.
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        resolved_config = json_load(config)
    elif config is None:
        resolved_config = {}
    else:
        # Copy so that the caller's object is never shared with the dump.
        resolved_config = copy.deepcopy(config)
    config_path = paths['samples'][0]['config']
    loom.config.config_dump(resolved_config, config_path)

    LOG('finding consensus')
    loom.consensus.make_consensus(paths=paths, debug=debug)
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.

    Builds a dense ProductValue holding one observed example value per
    schema feature and writes its serialized form to schema_row_out.

    Arguments:
        schema_in       Json schema file mapping feature name to model
        schema_row_out  Output file for the serialized schema row

    Raises:
        LoomError if the schema is empty or names an unknown model.
    '''
    schema = json_load(schema_in)
    if not schema:
        raise LoomError('Schema is empty: {}'.format(schema_in))

    schema_row = loom.schema_pb2.ProductValue()
    schema_row.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    model_to_datatype = loom.schema.MODEL_TO_DATATYPE
    for model in schema.itervalues():
        try:
            datatype = model_to_datatype[model]
        except KeyError:
            raise LoomError('Unknown model {} in schema {}'.format(
                model, schema_in))
        # Mark the feature observed, then store an example of its datatype
        # in the repeated field of the same name.
        schema_row.observed.dense.append(True)
        example = EXAMPLE_VALUES[datatype]
        getattr(schema_row, datatype).append(example)

    with open_compressed(schema_row_out, 'wb') as out_file:
        out_file.write(schema_row.SerializeToString())
def transform(name, schema_csv='schema.csv', rows_csv='rows.csv.gz'):
    '''
    Transform dataset from fluent format to loom internal format.

    First builds the transforms (which also yields the id field), then
    applies them to the rows.

    Arguments:
        name            A unique identifier for ingest + inference
        schema_csv      Schema file with columns [feature_name, datatype],
                        e.g.
                            Feature Name,Type
                            full name,id
                            start date,optional_date
                            age,real
                            zipcode,unbounded_categorical
                            description,text
                        Loom assumes the first line is a header and ignores
                        it. Features without datatypes are ignored.
        rows_csv        File or directory of csv files or csv.gz files

    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level

    Raises:
        LoomError if schema_csv or rows_csv does not exist.
    '''
    if not os.path.exists(schema_csv):
        raise LoomError('Missing schema_csv file: {}'.format(schema_csv))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    ingest_paths = loom.store.get_paths(name)['ingest']

    LOG('making transforms')
    id_field = loom.transforms.make_transforms(
        schema_in=schema_csv,
        rows_in=rows_csv,
        schema_out=ingest_paths['schema'],
        transforms_out=ingest_paths['transforms'])

    LOG('transforming rows')
    loom.transforms.transform_rows(
        schema_in=ingest_paths['schema'],
        transforms_in=ingest_paths['transforms'],
        rows_in=rows_csv,
        rows_out=ingest_paths['rows_csv'],
        id_field=id_field)
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from gzipped-protobuf-stream to directory-of-gzipped-csv-files.

    The output directory is deleted if it already exists and then recreated.
    Rows are dealt round-robin across ceil(row_count / chunk_size) files
    named rows.<i>.csv.gz, each starting with a header row of '_id' followed
    by the encoder names. Unobserved values are written as empty strings.

    Arguments:
        encoding_in     Json encoding file listing the encoders
        rows_in         Input protobuf row stream (read twice: once to count
                        rows, once to export them)
        rows_csv_out    Output directory name; must not be the working
                        directory and must not end in .csv, .gz or .bz2
        chunk_size      Target number of rows per output file, must be
                        positive

    Raises:
        LoomError if rows_csv_out or chunk_size is invalid.
    '''
    rows_csv_out = os.path.abspath(rows_csv_out)
    if rows_csv_out == os.getcwd():
        raise LoomError('Cannot export_rows to working directory')
    if rows_csv_out.endswith(('.csv', '.gz', '.bz2')):
        # The message previously lacked a placeholder, so the offending
        # path was silently dropped from the error.
        raise LoomError(
            'Expected rows_csv_out to be a dirname, actual {}'.format(
                rows_csv_out))
    if not (chunk_size > 0):
        raise LoomError('Invalid chunk_size {}, must be positive'.format(
            chunk_size))

    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = ['_id'] + [e['name'] for e in encoders]

    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)

    row_count = sum(1 for _ in protobuf_stream_load(rows_in))
    rows = loom.cFormat.row_stream_load(rows_in)
    # Ceiling division; `//` keeps the result integral regardless of whether
    # true division is enabled.
    chunk_count = (row_count + chunk_size - 1) // chunk_size
    chunks = sorted(
        os.path.join(rows_csv_out, 'rows.{}.csv.gz'.format(i))
        for i in xrange(chunk_count)
    )

    # ExitStack closes every writer, even if a later one fails to open.
    with ExitStack() as stack:
        writers = [stack.enter_context(csv_writer(f)) for f in chunks]
        for writer in writers:
            writer.writerow(header)
        for row, writer in izip(rows, cycle(writers)):
            data = row.iter_data()
            csv_row = [row.id]
            columns = izip(data['observed'], fields, decoders)
            for observed, field, decode in columns:
                # Only observed features consume a value from their
                # datatype's iterator.
                csv_row.append(decode(next(data[field])) if observed else '')
            writer.writerow(csv_row)
def infer_one(name, seed=0, config=None, debug=False):
    '''
    Infer a single sample.

    Writes the sample's config, generates an initial model, shuffles the
    ingested diffs, and then runs inference for the sample.

    Arguments:
        name            A unique identifier for ingest + inference
        seed            The seed, i.e., sample number typically 0-9
        config          An optional json config file, e.g.,
                        {"schedule": {"extra_passes": 500.0}}
                        An in-memory config object is deep-copied. The seed
                        is added under 'seed' unless already present.
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_VERBOSITY  Verbosity level

    Raises:
        LoomError if config is a filename that does not exist.
    '''
    # Request enough samples that index `seed` exists.
    paths = loom.store.get_paths(name, sample_count=(1 + seed))
    sample = paths['samples'][seed]
    ingest_paths = paths['ingest']

    LOG('making config')
    if isinstance(config, basestring):
        # A string is treated as the path of a json config file.
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    elif config is None:
        config = {}
    else:
        config = copy.deepcopy(config)
    if 'seed' not in config:
        config['seed'] = seed
    loom.config.config_dump(config, sample['config'])

    LOG('generating init')
    loom.generate.generate_init(
        encoding_in=ingest_paths['encoding'],
        model_out=sample['init'],
        seed=seed)

    LOG('shuffling rows')
    loom.runner.shuffle(
        rows_in=ingest_paths['diffs'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)

    LOG('inferring, watch {}'.format(sample['infer_log']))
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=ingest_paths['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        assign_out=sample['assign'],
        log_out=sample['infer_log'],
        debug=debug)
def rows():
    '''
    Yield one populated message per csv row read from `reader`.

    Uses reader, header_length, message, id_offset, id_stride and schema
    from the enclosing scope. The same message object is yielded every
    time and is cleared once the consumer resumes the generator, so each
    yielded message must be consumed before advancing.

    Raises:
        LoomError if a row's length differs from header_length.
    '''
    for row_index, cells in enumerate(reader):
        cell_count = len(cells)
        if cell_count != header_length:
            raise LoomError('row {} has wrong length {}:\n{}'.format(
                row_index, cell_count, cells))
        message.id = id_offset + id_stride * row_index
        for pos, add, encode in schema:
            if pos is None:
                # No csv column backs this feature.
                value = None
            else:
                value = cells[pos].strip()
            # Missing columns and blank cells both count as unobserved.
            observed = bool(value)
            message.add_observed(observed)
            if observed:
                add(encode(value))
        yield message
        message.Clear()
def infer(
        name,
        sample_count=DEFAULTS['sample_count'],
        config=None,
        debug=False):
    '''
    Infer samples in parallel.

    Dispatches one inference task per seed in 0..sample_count-1.

    Arguments:
        name            A unique identifier for ingest + inference
        sample_count    The number of samples to draw, typically 10-100
        config          An optional json config file, e.g.,
                        {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code

    Environment variables:
        LOOM_THREADS    Number of concurrent inference tasks
        LOOM_VERBOSITY  Verbosity level

    Raises:
        LoomError if sample_count is not at least 1.
    '''
    if not (sample_count >= 1):
        raise LoomError('Too few samples: {}'.format(sample_count))
    # Each task is the argument tuple for one seed.
    tasks = [(name, seed, config, debug) for seed in xrange(sample_count)]
    parallel_map(_infer_one, tasks)
def find_consensus_grouping(groupings, debug=False):
    '''
    This implements Strehl et al's Meta-Clustering Algorithm [1].

    Every group of every sample becomes a graph vertex, vertices are linked
    by the Jaccard similarity of their member sets, the graph is partitioned
    with metis, and each object is then assigned to the partition whose
    member groups contain it most often.

    Inputs:
        groupings - a list of lists of lists of object ids, for example

            [
                [                   # sample 0
                    [0, 1, 2],      # sample 0, group 0
                    [3, 4],         # sample 0, group 1
                    [5]             # sample 0, group 2
                ],
                [                   # sample 1
                    [0, 1],         # sample 1, group 0
                    [2, 3, 4, 5]    # sample 1, group 1
                ]
            ]

        debug - if true, the metis arguments are dumped to
            METIS_ARGS_TEMPFILE before calling metis and the file is removed
            after metis returns

    Returns:
        a list of Row instances sorted by
        (row.group_id, - row.confidence, row.row_id)

    Raises:
        LoomError if groupings is empty, if the similarity exceeds 1 (or is
        nan), if metis returns a partition of the wrong length, or if any
        confidence is not finite.

    References:
    [1] Alexander Strehl, Joydeep Ghosh, Claire Cardie (2002)
        "Cluster Ensembles - A Knowledge Reuse Framework
        for Combining Multiple Partitions"
        Journal of Machine Learning Research
        http://jmlr.csail.mit.edu/papers/volume3/strehl02a/strehl02a.pdf
    '''
    if not groupings:
        raise LoomError('tried to find consensus among zero groupings')

    # ------------------------------------------------------------------------
    # Set up consensus grouping problem

    # Flatten samples into one list of groups, then collect the sorted set of
    # distinct object ids and map each id to its position.
    allgroups = sum(groupings, [])
    objects = list(set(sum(allgroups, [])))
    objects.sort()
    index = {item: i for i, item in enumerate(objects)}

    # Each vertex is one group, stored as an array of object positions.
    # NOTE(review): relies on map() returning a list (Python 2) -- confirm
    # before porting.
    vertices = [
        numpy.array(map(index.__getitem__, g), dtype=numpy.intp)
        for g in allgroups
    ]

    # contains[v, o] is 1 when group v contains object o, else 0.
    contains = numpy.zeros((len(vertices), len(objects)), dtype=numpy.float32)
    for v, vertex in enumerate(vertices):
        contains[v, vertex] = 1  # i.e. for u in vertex: contains[v, u] = 1

    # We use the binary Jaccard measure for similarity
    # overlap[u, v] is the intersection size, its diagonal holds the group
    # sizes, and denom is therefore the union size.
    overlap = numpy.dot(contains, contains.T)
    diag = overlap.diagonal()
    denom = (diag.reshape(len(vertices), 1) +
             diag.reshape(1, len(vertices)) - overlap)
    similarity = overlap / denom

    # ------------------------------------------------------------------------
    # Format for metis

    # The negated comparison also rejects a nan maximum.
    if not (similarity.max() <= 1):
        raise LoomError('similarity.max() = {}'.format(similarity.max()))
    # Scale to integer edge weights in [0, 2**16].
    similarity *= 2**16  # metis segfaults if this is too large
    int_similarity = numpy.zeros(similarity.shape, dtype=numpy.int32)
    int_similarity[:] = numpy.rint(similarity)

    # Edges are the nonzero entries; weights are listed in the same order as
    # the (i, j) pairs that fill the adjacency lists below.
    edges = int_similarity.nonzero()
    edge_weights = map(int, int_similarity[edges])
    edges = numpy.transpose(edges)

    adjacency = [[] for _ in vertices]
    for i, j in edges:
        adjacency[i].append(j)

    # FIXME is there a better way to choose the final group count?
    # Currently the median number of groups per sample.
    group_count = int(numpy.median(map(len, groupings)))

    metis_args = {
        'nparts': group_count,
        'adjacency': adjacency,
        'eweights': edge_weights,
    }

    # The dump is written before the call and removed only after it returns,
    # so it is left behind if part_graph does not return normally.
    if debug:
        json_dump(metis_args, METIS_ARGS_TEMPFILE, indent=4)
    edge_cut, partition = pymetis.part_graph(**metis_args)
    if debug:
        os.remove(METIS_ARGS_TEMPFILE)

    # ------------------------------------------------------------------------
    # Clean up solution

    parts = range(group_count)
    if len(partition) != len(vertices):
        raise LoomError('metis output vector has wrong length')

    # represents[p, v] is 1 when metis placed vertex v in partition p.
    represents = numpy.zeros((len(parts), len(vertices)))
    for v, p in enumerate(partition):
        represents[p, v] = 1

    # After this, contains[p, o] is the fraction of partition p's groups that
    # contain object o.
    contains = numpy.dot(represents, contains)
    represent_counts = represents.sum(axis=1)
    represent_counts[numpy.where(represent_counts == 0)] = 1  # avoid NANs
    contains /= represent_counts.reshape(group_count, 1)

    # For each object pick the partition with the highest fraction; that
    # fraction is the object's confidence.
    bestmatch = contains.argmax(axis=0)
    confidence = contains[bestmatch, range(len(bestmatch))]
    if not all(numpy.isfinite(confidence)):
        raise LoomError('confidence is nan')

    # Renumber the partitions that actually won some object to 0..k-1.
    nonempty_groups = list(set(bestmatch))
    nonempty_groups.sort()
    reindex = {j: i for i, j in enumerate(nonempty_groups)}

    grouping = [
        Row(row_id=objects[i], group_id=reindex[g], confidence=c)
        for i, (g, c) in enumerate(izip(bestmatch, confidence))
    ]

    # Renumber groups again so that larger groups get smaller ids.
    # NOTE(review): assumes collate returns a list of per-key lists of rows
    # -- confirm against its definition.
    groups = collate((row.group_id, row) for row in grouping)
    groups.sort(key=len, reverse=True)
    grouping = [
        Row(row_id=row.row_id, group_id=group_id, confidence=row.confidence)
        for group_id, group in enumerate(groups)
        for row in group
    ]

    # Final order: by group, most confident first, ties broken by row id.
    grouping.sort(key=lambda x: (x.group_id, -x.confidence, x.row_id))
    return grouping