Example 1
def _make_encoder_builders_file((schema_in, rows_in)):
    assert os.path.isfile(rows_in)
    schema = json_load(schema_in)
    with csv_reader(rows_in) as reader:
        header = reader.next()
        builders = []
        seen = set()
        for name in header:
            if name in schema:
                if name in seen:
                    raise LoomError('Repeated column {} in csv file {}'.format(
                        name, rows_in))
                seen.add(name)
                model = schema[name]
                Builder = ENCODER_BUILDERS[model]
                builder = Builder(name, model)
            else:
                builder = None
            builders.append(builder)
        if all(builder is None for builder in builders):
            raise LoomError('Csv file has no known features;'
                            ' try adding a header to {}'.format(rows_in))
        missing_features = sorted(set(schema) - seen)
        if missing_features:
            raise LoomError('\n  '.join(['Csv file is missing features:'] +
                                        missing_features))
        for row in reader:
            for value, builder in izip(row, builders):
                if builder is not None:
                    value = value.strip()
                    if value:
                        builder.add_value(value)
    return [b for b in builders if b is not None]
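The core of this function is matching csv header columns against the schema. A minimal standalone sketch of that pattern, with a hypothetical schema and header, and plain tuples standing in for loom's encoder builders:

schema = {'age': 'nich', 'zipcode': 'dd'}   # loom model names
header = ['age', 'comment', 'zipcode']
builders = [(name, schema[name]) if name in schema else None
            for name in header]
assert builders == [('age', 'nich'), None, ('zipcode', 'dd')]
# Columns absent from the schema map to None and are dropped at the end;
# schema features absent from the header raise the missing-features error.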
Example 2
def ingest(name, schema=None, rows_csv=None, id_field=None, debug=False):
    '''
    Ingest dataset with optional json config.
    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files or csv.gz files
        id_field        Column name of id field in input csv
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    if schema is None:
        schema = paths['ingest']['schema']
    if rows_csv is None:
        rows_csv = paths['ingest']['rows_csv']
    if not os.path.exists(schema):
        raise LoomError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)

    LOG('making schema row')
    loom.format.make_schema_row(schema_in=schema,
                                schema_row_out=paths['ingest']['schema_row'])

    LOG('making encoding')
    loom.format.make_encoding(schema_in=schema,
                              rows_in=rows_csv,
                              encoding_out=paths['ingest']['encoding'])

    LOG('importing rows')
    loom.format.import_rows(encoding_in=paths['ingest']['encoding'],
                            rows_csv_in=rows_csv,
                            rows_out=paths['ingest']['rows'])

    LOG('importing rowids')
    loom.format.import_rowids(rows_csv_in=rows_csv,
                              rowids_out=paths['ingest']['rowids'],
                              id_field=id_field)

    LOG('making tare rows')
    loom.runner.tare(schema_row_in=paths['ingest']['schema_row'],
                     rows_in=paths['ingest']['rows'],
                     tares_out=paths['ingest']['tares'],
                     debug=debug)

    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(schema_row_in=paths['ingest']['schema_row'],
                         tares_in=paths['ingest']['tares'],
                         rows_in=paths['ingest']['rows'],
                         rows_out=paths['ingest']['diffs'],
                         debug=debug)
    loom.config.config_dump({}, paths['query']['config'])
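Assuming this function is exposed as loom.tasks.ingest, as the surrounding loom code suggests, a typical invocation might look like the following; the dataset name, file names, and id column are hypothetical:

import loom.tasks

loom.tasks.ingest(
    'my-dataset',
    schema='schema.json',     # json mapping feature names to loom models
    rows_csv='rows.csv.gz',   # csv file or directory of csv/csv.gz files
    id_field='_id')           # hypothetical id column name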
Example 3
def make_consensus(name, config=None, debug=False):
    '''
    Combine samples into a single consensus sample.
    Arguments:
        name            A unique identifier for consensus
        config          An optional json config file;
                            currently unused, but reserved for future
                            options such as cluster coarseness
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    loom.config.config_dump(config, paths['samples'][0]['config'])

    LOG('finding consensus')
    loom.consensus.make_consensus(paths=paths, debug=debug)
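A hedged usage sketch, assuming this is loom.tasks.make_consensus; consensus presupposes samples already produced by infer (Example 9), and the dataset name is hypothetical:

import loom.tasks

loom.tasks.infer('my-dataset', sample_count=10)
loom.tasks.make_consensus('my-dataset')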
Example 4
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    if not schema:
        raise LoomError('Schema is empty: {}'.format(schema_in))
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        try:
            field = loom.schema.MODEL_TO_DATATYPE[model]
        except KeyError:
            raise LoomError('Unknown model {} in schema {}'.format(
                model, schema_in))
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())
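A minimal sketch of a call, assuming loom.format.make_schema_row as used in Example 2; file names are hypothetical:

import loom.format

# schema.json might contain {"age": "nich", "zipcode": "dd"}; every model
# name must be a key of loom.schema.MODEL_TO_DATATYPE, else LoomError.
loom.format.make_schema_row(
    schema_in='schema.json',
    schema_row_out='schema.pb.gz')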
Example 5
def transform(
        name,
        schema_csv='schema.csv',
        rows_csv='rows.csv.gz'):
    '''
    Transform dataset from fluent format to loom internal format.
    Arguments:
        name            A unique identifier for ingest + inference
        schema_csv      Schema file with columns [feature_name, datatype], e.g.
                            Feature Name,Type
                            full name,id
                            start date,optional_date
                            age,real
                            zipcode,unbounded_categorical
                            description,text
                        Loom assumes the first line is a header and ignores it.
                        Features without datatypes are ignored.
        rows_csv        File or directory of csv files or csv.gz files
    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    if not os.path.exists(schema_csv):
        raise LoomError('Missing schema_csv file: {}'.format(schema_csv))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    paths = loom.store.get_paths(name)

    LOG('making transforms')
    id_field = loom.transforms.make_transforms(
        schema_in=schema_csv,
        rows_in=rows_csv,
        schema_out=paths['ingest']['schema'],
        transforms_out=paths['ingest']['transforms'])

    LOG('transforming rows')
    loom.transforms.transform_rows(
        schema_in=paths['ingest']['schema'],
        transforms_in=paths['ingest']['transforms'],
        rows_in=rows_csv,
        rows_out=paths['ingest']['rows_csv'],
        id_field=id_field)
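Because transform writes its outputs to the default ingest paths, it chains naturally into ingest. A hypothetical two-step pipeline, assuming both functions live in loom.tasks:

import loom.tasks

loom.tasks.transform(
    'my-dataset',
    schema_csv='schema.csv',
    rows_csv='rows.csv.gz')
loom.tasks.ingest('my-dataset')  # picks up the schema and rows_csv written above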
Example 6
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from gzipped-protobuf-stream to directory-of-gzipped-csv-files.
    '''
    rows_csv_out = os.path.abspath(rows_csv_out)
    if rows_csv_out == os.getcwd():
        raise LoomError('Cannot export_rows to working directory')
    for ext in ['.csv', '.gz', '.bz2']:
        if rows_csv_out.endswith(ext):
            raise LoomError(
                'Expected rows_csv_out to be a dirname, actual {}'.format(
                    rows_csv_out))
    if not (chunk_size > 0):
        raise LoomError('Invalid chunk_size {}, must be positive'.format(
            chunk_size))
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = ['_id'] + [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    row_count = sum(1 for _ in protobuf_stream_load(rows_in))
    rows = loom.cFormat.row_stream_load(rows_in)
    chunk_count = (row_count + chunk_size - 1) // chunk_size  # ceiling division
    chunks = sorted(
        os.path.join(rows_csv_out, 'rows.{}.csv.gz'.format(i))
        for i in xrange(chunk_count)
    )
    with ExitStack() as stack:
        with_ = stack.enter_context
        writers = [with_(csv_writer(f)) for f in chunks]
        for writer in writers:
            writer.writerow(header)
        for row, writer in izip(rows, cycle(writers)):
            data = row.iter_data()
            schema = izip(data['observed'], fields, decoders)
            csv_row = [row.id]
            for observed, field, decode in schema:
                csv_row.append(decode(data[field].next()) if observed else '')
            writer.writerow(csv_row)
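A hedged usage sketch, assuming loom.format.export_rows and input files produced by a prior ingest; the paths are hypothetical:

import loom.format

# Writes rows.0.csv.gz, rows.1.csv.gz, ... into a freshly created directory.
loom.format.export_rows(
    encoding_in='encoding.json.gz',
    rows_in='rows.pbs.gz',
    rows_csv_out='exported_rows',
    chunk_size=500000)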
Example 7
def infer_one(name, seed=0, config=None, debug=False):
    '''
    Infer a single sample.
    Arguments:
        name            A unique identifier for ingest + inference
        seed            The seed, i.e., sample number, typically 0-9
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name, sample_count=(1 + seed))
    sample = paths['samples'][seed]

    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    if 'seed' not in config:
        config['seed'] = seed
    loom.config.config_dump(config, sample['config'])

    LOG('generating init')
    loom.generate.generate_init(
        encoding_in=paths['ingest']['encoding'],
        model_out=sample['init'],
        seed=seed)

    LOG('shuffling rows')
    loom.runner.shuffle(
        rows_in=paths['ingest']['diffs'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)

    LOG('inferring, watch {}'.format(sample['infer_log']))
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=paths['ingest']['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        assign_out=sample['assign'],
        log_out=sample['infer_log'],
        debug=debug)
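A usage sketch, assuming loom.tasks.infer_one; note that config may be either a dict or a path to a json file, and the dataset name is hypothetical:

import loom.tasks

loom.tasks.infer_one(
    'my-dataset',
    seed=0,
    config={'schedule': {'extra_passes': 500.0}})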
Example 8
def rows():
    for i, row in enumerate(reader):
        if len(row) != header_length:
            raise LoomError('row {} has wrong length {}:\n{}'.format(
                i, len(row), row))
        message.id = id_offset + id_stride * i
        for pos, add, encode in schema:
            value = None if pos is None else row[pos].strip()
            observed = bool(value)
            message.add_observed(observed)
            if observed:
                add(encode(value))
        yield message
        message.Clear()
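The yield-then-Clear idiom above reuses one protobuf message for the whole stream, so consumers must serialize each yielded message before advancing the generator. A minimal sketch of the aliasing behavior, with a hypothetical Message class standing in for loom's row message:

class Message(object):
    def __init__(self):
        self.id = None

    def Clear(self):
        self.id = None

def rows(ids):
    message = Message()
    for i in ids:
        message.id = i
        yield message    # use or serialize this immediately...
        message.Clear()  # ...because the same object is cleared and reused

msgs = list(rows([1, 2]))
assert msgs[0] is msgs[1] and msgs[0].id is None  # naive collection aliases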
Example 9
def infer(name,
          sample_count=DEFAULTS['sample_count'],
          config=None,
          debug=False):
    '''
    Infer samples in parallel.
    Arguments:
        name            A unique identifier for ingest + inference
        sample_count    The number of samples to draw, typically 10-100
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent inference tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    if not (sample_count >= 1):
        raise LoomError('Too few samples: {}'.format(sample_count))
    parallel_map(_infer_one, [(name, seed, config, debug)
                              for seed in xrange(sample_count)])
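A usage sketch, assuming loom.tasks.infer; LOOM_THREADS caps how many samples run concurrently, and the dataset name is hypothetical:

import os
import loom.tasks

os.environ['LOOM_THREADS'] = '4'
loom.tasks.infer('my-dataset', sample_count=10)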
Example 10
def find_consensus_grouping(groupings, debug=False):
    '''
    This implements Strehl et al.'s Meta-Clustering Algorithm [1].

    Inputs:
        groupings - a list of lists of lists of object ids, for example

            [
                [                   # sample 0
                    [0, 1, 2],      # sample 0, group 0
                    [3, 4],         # sample 0, group 1
                    [5]             # sample 0, group 2
                ],
                [                   # sample 1
                    [0, 1],         # sample 1, group 0
                    [2, 3, 4, 5]    # sample 1, group 1
                ]
            ]

    Returns:
        a list of Row instances sorted by (row.group_id, -row.confidence)

    References:
    [1] Alexander Strehl, Joydeep Ghosh, Claire Cardie (2002)
        "Cluster Ensembles - A Knowledge Reuse Framework
        for Combining Multiple Partitions"
        Journal of Machine Learning Research
        http://jmlr.csail.mit.edu/papers/volume3/strehl02a/strehl02a.pdf
    '''
    if not groupings:
        raise LoomError('tried to find consensus among zero groupings')

    # ------------------------------------------------------------------------
    # Set up consensus grouping problem

    allgroups = sum(groupings, [])
    objects = list(set(sum(allgroups, [])))
    objects.sort()
    index = {item: i for i, item in enumerate(objects)}

    vertices = [
        numpy.array(map(index.__getitem__, g), dtype=numpy.intp)
        for g in allgroups
    ]

    contains = numpy.zeros((len(vertices), len(objects)), dtype=numpy.float32)
    for v, vertex in enumerate(vertices):
        contains[v, vertex] = 1  # i.e. for u in vertex: contains[v, u] = 1

    # We use the binary Jaccard measure for similarity
    overlap = numpy.dot(contains, contains.T)
    diag = overlap.diagonal()
    denom = (diag.reshape(len(vertices), 1) + diag.reshape(1, len(vertices)) -
             overlap)
    similarity = overlap / denom

    # ------------------------------------------------------------------------
    # Format for metis

    if not (similarity.max() <= 1):
        raise LoomError('similarity.max() = {}'.format(similarity.max()))
    similarity *= 2**16  # metis segfaults if this is too large
    int_similarity = numpy.zeros(similarity.shape, dtype=numpy.int32)
    int_similarity[:] = numpy.rint(similarity)

    edges = int_similarity.nonzero()
    edge_weights = map(int, int_similarity[edges])
    edges = numpy.transpose(edges)

    adjacency = [[] for _ in vertices]
    for i, j in edges:
        adjacency[i].append(j)

    # FIXME is there a better way to choose the final group count?
    group_count = int(numpy.median(map(len, groupings)))

    metis_args = {
        'nparts': group_count,
        'adjacency': adjacency,
        'eweights': edge_weights,
    }

    if debug:
        json_dump(metis_args, METIS_ARGS_TEMPFILE, indent=4)

    edge_cut, partition = pymetis.part_graph(**metis_args)

    if debug:
        os.remove(METIS_ARGS_TEMPFILE)

    # ------------------------------------------------------------------------
    # Clean up solution

    parts = range(group_count)
    if len(partition) != len(vertices):
        raise LoomError('metis output vector has wrong length')

    represents = numpy.zeros((len(parts), len(vertices)))
    for v, p in enumerate(partition):
        represents[p, v] = 1

    contains = numpy.dot(represents, contains)
    represent_counts = represents.sum(axis=1)
    represent_counts[numpy.where(represent_counts == 0)] = 1  # avoid NANs
    contains /= represent_counts.reshape(group_count, 1)

    bestmatch = contains.argmax(axis=0)
    confidence = contains[bestmatch, range(len(bestmatch))]
    if not all(numpy.isfinite(confidence)):
        raise LoomError('confidence is nan')

    nonempty_groups = list(set(bestmatch))
    nonempty_groups.sort()
    reindex = {j: i for i, j in enumerate(nonempty_groups)}

    grouping = [
        Row(row_id=objects[i], group_id=reindex[g], confidence=c)
        for i, (g, c) in enumerate(izip(bestmatch, confidence))
    ]

    groups = collate((row.group_id, row) for row in grouping)
    groups.sort(key=len, reverse=True)
    grouping = [
        Row(row_id=row.row_id, group_id=group_id, confidence=row.confidence)
        for group_id, group in enumerate(groups) for row in group
    ]
    grouping.sort(key=lambda x: (x.group_id, -x.confidence, x.row_id))

    return grouping
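A worked call reusing the groupings from the docstring (this requires pymetis at runtime); Row fields are row_id, group_id, and confidence:

groupings = [
    [[0, 1, 2], [3, 4], [5]],   # sample 0
    [[0, 1], [2, 3, 4, 5]],     # sample 1
]
rows = find_consensus_grouping(groupings)
# rows come back sorted by (group_id, -confidence, row_id),
# with group 0 the largest consensus group.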