Example #1
def make_fake_encoding(schema_in, model_in, encoding_out):
    '''
    Make a fake encoding from json schema + model.
    Assume that feature names in schema correspond to featureids in model
    e.g. schema was generated from loom.format.make_schema
    '''
    schema = json_load(schema_in)
    fields = []
    builders = []
    name_to_builder = {}
    for name, model in sorted(schema.iteritems()):
        fields.append(loom.schema.MODEL_TO_DATATYPE[model])
        Builder = FAKE_ENCODER_BUILDERS[model]
        builder = Builder(name, model)
        builders.append(builder)
        name_to_builder[name] = builder

    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    for kind in cross_cat.kinds:
        featureid = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                feature_name = '{:06d}'.format(featureid.next())
                assert feature_name in schema
                if model == 'dd':
                    for i in range(len(shared.alphas)):
                        name_to_builder[feature_name].add_value(str(i))
                elif model == 'dpd':
                    for val in shared.values:
                        name_to_builder[feature_name].add_value(str(val))
    encoders = [b.build() for b in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
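
A minimal usage sketch with hypothetical file names, assuming 'schema.json' was produced by loom.format.make_schema from the same compressed CrossCat model:

make_fake_encoding(
    schema_in='schema.json',
    model_in='model.pb.gz',
    encoding_out='encoding.json')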
Example #2
def load(name, schema, rows_csv):
    '''
    Load a csv dataset for testing and benchmarking.
    '''
    assert os.path.exists(schema)
    assert schema.endswith('.json')
    assert os.path.exists(rows_csv)
    if os.path.isfile(rows_csv):
        assert rows_csv.endswith('.csv') or rows_csv.endswith('.csv.gz')
    else:
        assert os.path.isdir(rows_csv)
    paths = loom.store.get_paths(name)
    assert not os.path.exists(paths['root']), 'dataset already loaded'
    json_dump(json_load(schema), paths['ingest']['schema'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    if os.path.isdir(rows_csv):
        os.symlink(rows_csv, paths['ingest']['rows_csv'])
    else:
        os.makedirs(paths['ingest']['rows_csv'])
        os.symlink(
            rows_csv,
            os.path.join(
                paths['ingest']['rows_csv'],
                os.path.basename(rows_csv)))
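
A minimal usage sketch with a hypothetical dataset name and file paths; the name must not already exist in the loom store:

load('my_dataset', schema='schema.json', rows_csv='rows.csv.gz')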
Example #3
def _test_modify_schema(modify, name, schema, rows_csv, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR) as store:
        with mock.patch('loom.store.STORE', new=store):
            modified_schema = os.path.join(store, 'schema.json')
            data = json_load(schema)
            data = modify(data)
            json_dump(data, modified_schema)
            loom.tasks.ingest(name, modified_schema, rows_csv, debug=True)
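
A hedged sketch of how this test helper might be driven; the modify callback, dataset name, and paths below are hypothetical:

def drop_one_feature(schema):
    # remove an arbitrary feature and return the modified schema
    schema.pop(sorted(schema.iterkeys())[0])
    return schema

_test_modify_schema(drop_one_feature, 'my_dataset', 'schema.json', 'rows.csv')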
Example #4
def make_encoding(schema_in, rows_in, encoding_out):
    '''
    Make a row encoder from csv rows data + json schema.
    '''
    if os.path.isdir(rows_in):
        builders = _make_encoder_builders_dir(schema_in, rows_in)
    else:
        builders = _make_encoder_builders_file((schema_in, rows_in))
    encoders = [builder.build() for builder in builders]
    encoders.sort(key=get_encoder_rank)
    json_dump(encoders, encoding_out)
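
A minimal usage sketch with hypothetical paths; rows_in may be either a single csv file or a directory of csv files:

make_encoding(
    schema_in='schema.json',
    rows_in='rows.csv',
    encoding_out='encoding.json.gz')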
Example #5
def make_transforms(schema_in, rows_in, schema_out, transforms_out):
    fluent_schema = load_schema(schema_in)
    basic_schema = {}
    pre_transforms = []
    transforms = []
    builders = []
    dates = [
        feature_name
        for feature_name, fluent_type in fluent_schema.iteritems()
        if fluent_type.endswith('date')
    ]
    id_field = None
    for feature_name, fluent_type in fluent_schema.iteritems():
        # parse adjectives
        if fluent_type.startswith('optional_'):
            transform = PresenceTransform(feature_name)
            pre_transforms.append(transform)
            transforms.append(transform)
            fluent_type = fluent_type[len('optional_'):]
            feature_name = '{}.value'.format(feature_name)

        # parse nouns
        if fluent_type == 'id':
            id_field = feature_name
        elif fluent_type in ['categorical', 'unbounded_categorical']:
            transforms.append(StringTransform(feature_name, fluent_type))
        elif fluent_type == 'percent':
            transforms.append(PercentTransform(feature_name))
        elif fluent_type == 'sparse_real':
            transforms.append(SparseRealTransform(feature_name))
        elif fluent_type == 'text':
            builders.append(TextTransformBuilder(feature_name))
        elif fluent_type == 'tags':
            builders.append(
                TextTransformBuilder(feature_name, allow_empty=True))
        elif fluent_type == 'date':
            relatives = [other for other in dates if other < feature_name]
            transforms.append(DateTransform(feature_name, relatives))
        else:
            basic_type = FLUENT_TO_BASIC[fluent_type]
            basic_schema[feature_name] = basic_type
    if builders:
        transforms += build_transforms(rows_in, pre_transforms, builders)
    for transform in transforms:
        basic_schema.update(transform.get_schema())
    json_dump(basic_schema, schema_out)
    pickle_dump(transforms, transforms_out)
    LOG('transformed {} -> {} features'.format(
        len(fluent_schema),
        len(basic_schema)))
    return id_field
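
A hedged usage sketch with hypothetical paths; the call returns the name of the feature declared as 'id' in the fluent schema, or None if there is none:

id_field = make_transforms(
    schema_in='fluent_schema.json',
    rows_in='rows.csv',
    schema_out='basic_schema.json',
    transforms_out='transforms.pickle.gz')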
Example #6
def crossvalidate_one(seed, test_count, train_count, inputs, results,
                      extra_passes, debug):
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(results['root'], 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')

    config = {
        'seed': seed,
        'schedule': {
            'extra_passes': extra_passes
        },
    }
    loom.config.config_dump(config, results['samples'][0]['config'])

    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump((row for s, row in izip(split, diffs_in) if s),
                         results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump((row for s, row in izip(split, rows_in) if not s),
                         results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(rows_in=results['train'],
                        rows_out=results['samples'][0]['shuffled'],
                        seed=seed,
                        debug=debug)
    LOG(' init')
    loom.generate.generate_init(encoding_in=inputs['ingest']['encoding'],
                                model_out=results['samples'][0]['init'],
                                seed=seed)
    LOG(' infer')
    loom.runner.infer(config_in=results['samples'][0]['config'],
                      rows_in=results['samples'][0]['shuffled'],
                      tares_in=inputs['ingest']['tares'],
                      model_in=results['samples'][0]['init'],
                      model_out=results['samples'][0]['model'],
                      groups_out=results['samples'][0]['groups'],
                      debug=debug)
    LOG(' query')
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]

    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)
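
A hedged sketch of a single fold, mirroring how crossvalidate() in Example #8 below invokes this helper; the dataset name, seed, and row counts are hypothetical:

inputs = loom.store.get_paths('my_dataset')
results = loom.store.get_paths(os.path.join('my_dataset', 'crossvalidate/0'))
mean_score = crossvalidate_one(
    seed=0,
    test_count=10,
    train_count=90,
    inputs=inputs,
    results=results,
    extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
    debug=False)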
Example #7
def make_schema(model_in, schema_out):
    '''
    Make a schema from a protobuf model.
    '''
    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    schema = {}
    for kind in cross_cat.kinds:
        featureid = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                feature_name = '{:06d}'.format(featureid.next())
                schema[feature_name] = model
    json_dump(schema, schema_out)
    return schema
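
A minimal usage sketch with hypothetical paths; the returned dict maps zero-padded feature names to model names:

schema = make_schema(model_in='model.pb.gz', schema_out='schema.json')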
Example #8
def crossvalidate(
        name=None,
        sample_count=10,
        portion=0.9,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False):
    '''
    Randomly split dataset; train models; score held-out data.
    '''
    assert 0 < portion and portion < 1, portion
    assert sample_count > 0, sample_count
    loom.store.require(name, [
        'ingest.encoding',
        'ingest.tares',
        'ingest.diffs',
    ])
    inputs = loom.store.get_paths(name)

    row_count = sum(1 for _ in protobuf_stream_load(inputs['ingest']['diffs']))
    assert row_count > 1, 'too few rows to crossvalidate: {}'.format(row_count)
    train_count = max(1, min(row_count - 1, int(round(portion * row_count))))
    test_count = row_count - train_count
    assert 1 <= train_count and 1 <= test_count

    mean_scores = []
    for seed in xrange(sample_count):
        results = loom.store.get_paths(
            os.path.join(name, 'crossvalidate/{}'.format(seed)))
        mean = crossvalidate_one(
            seed,
            test_count,
            train_count,
            inputs,
            results,
            extra_passes,
            debug)
        mean_scores.append(mean)

    results = loom.store.get_paths(os.path.join(name, 'crossvalidate'))
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')
    json_dump(mean_scores, results['scores'])
    print 'score = {} +- {}'.format(
        numpy.mean(mean_scores),
        numpy.std(mean_scores))
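
A hedged usage sketch with a hypothetical dataset name that has already been ingested, so that ingest.encoding, ingest.tares, and ingest.diffs exist in the loom store:

crossvalidate(name='my_dataset', sample_count=5, portion=0.8)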
Example #9
def make_fake_encoding(schema_in, rows_in, encoding_out):
    '''
    Make a fake encoding from json schema + protobuf rows.
    '''
    schema = json_load(schema_in)
    fields = []
    builders = []
    for name, model in sorted(schema.iteritems()):
        fields.append(loom.schema.MODEL_TO_DATATYPE[model])
        Builder = FAKE_ENCODER_BUILDERS[model]
        builder = Builder(name, model)
        builders.append(builder)
    for row in loom.cFormat.row_stream_load(rows_in):
        data = row.iter_data()
        observeds = data['observed']
        for observed, field, builder in izip(observeds, fields, builders):
            if observed:
                builder.add_value(str(data[field].next()))
    encoders = [builder.build() for builder in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
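
A minimal usage sketch with hypothetical paths; rows_in points at rows in the format read by loom.cFormat.row_stream_load:

make_fake_encoding(
    schema_in='schema.json',
    rows_in='rows.pbs.gz',
    encoding_out='encoding.json')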
Example #10
def find_consensus_grouping(groupings, debug=False):
    '''
    This implements Strehl et al's Meta-Clustering Algorithm [1].

    Inputs:
        groupings - a list of lists of lists of object ids, for example

            [
                [                   # sample 0
                    [0, 1, 2],      # sample 0, group 0
                    [3, 4],         # sample 0, group 1
                    [5]             # sample 0, group 2
                ],
                [                   # sample 1
                    [0, 1],         # sample 1, group 0
                    [2, 3, 4, 5]    # sample 1, group 1
                ]
            ]

    Returns:
        a list of Row instances sorted by (- row.group_id, row.confidence)

    References:
    [1] Alexander Strehl, Joydeep Ghosh, Claire Cardie (2002)
        "Cluster Ensembles - A Knowledge Reuse Framework
        for Combining Multiple Partitions"
        Journal of Machine Learning Research
        http://jmlr.csail.mit.edu/papers/volume3/strehl02a/strehl02a.pdf
    '''
    if not groupings:
        raise LoomError('tried to find consensus among zero groupings')

    # ------------------------------------------------------------------------
    # Set up consensus grouping problem

    allgroups = sum(groupings, [])
    objects = list(set(sum(allgroups, [])))
    objects.sort()
    index = {item: i for i, item in enumerate(objects)}

    vertices = [
        numpy.array(map(index.__getitem__, g), dtype=numpy.intp)
        for g in allgroups
    ]

    contains = numpy.zeros((len(vertices), len(objects)), dtype=numpy.float32)
    for v, vertex in enumerate(vertices):
        contains[v, vertex] = 1  # i.e. for u in vertex: contains[v, u] = 1

    # We use the binary Jaccard measure for similarity
    overlap = numpy.dot(contains, contains.T)
    diag = overlap.diagonal()
    denom = (diag.reshape(len(vertices), 1) + diag.reshape(1, len(vertices)) -
             overlap)
    similarity = overlap / denom

    # ------------------------------------------------------------------------
    # Format for metis

    if not (similarity.max() <= 1):
        raise LoomError('similarity.max() = {}'.format(similarity.max()))
    similarity *= 2**16  # metis segfaults if this is too large
    int_similarity = numpy.zeros(similarity.shape, dtype=numpy.int32)
    int_similarity[:] = numpy.rint(similarity)

    edges = int_similarity.nonzero()
    edge_weights = map(int, int_similarity[edges])
    edges = numpy.transpose(edges)

    adjacency = [[] for _ in vertices]
    for i, j in edges:
        adjacency[i].append(j)

    # FIXME is there a better way to choose the final group count?
    group_count = int(numpy.median(map(len, groupings)))

    metis_args = {
        'nparts': group_count,
        'adjacency': adjacency,
        'eweights': edge_weights,
    }

    if debug:
        json_dump(metis_args, METIS_ARGS_TEMPFILE, indent=4)

    edge_cut, partition = pymetis.part_graph(**metis_args)

    if debug:
        os.remove(METIS_ARGS_TEMPFILE)

    # ------------------------------------------------------------------------
    # Clean up solution

    parts = range(group_count)
    if len(partition) != len(vertices):
        raise LoomError('metis output vector has wrong length')

    represents = numpy.zeros((len(parts), len(vertices)))
    for v, p in enumerate(partition):
        represents[p, v] = 1

    contains = numpy.dot(represents, contains)
    represent_counts = represents.sum(axis=1)
    represent_counts[numpy.where(represent_counts == 0)] = 1  # avoid NANs
    contains /= represent_counts.reshape(group_count, 1)

    bestmatch = contains.argmax(axis=0)
    confidence = contains[bestmatch, range(len(bestmatch))]
    if not all(numpy.isfinite(confidence)):
        raise LoomError('confidence is nan')

    nonempty_groups = list(set(bestmatch))
    nonempty_groups.sort()
    reindex = {j: i for i, j in enumerate(nonempty_groups)}

    grouping = [
        Row(row_id=objects[i], group_id=reindex[g], confidence=c)
        for i, (g, c) in enumerate(izip(bestmatch, confidence))
    ]

    groups = collate((row.group_id, row) for row in grouping)
    groups.sort(key=len, reverse=True)
    grouping = [
        Row(row_id=row.row_id, group_id=group_id, confidence=row.confidence)
        for group_id, group in enumerate(groups) for row in group
    ]
    grouping.sort(key=lambda x: (x.group_id, -x.confidence, x.row_id))

    return grouping
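
A minimal usage sketch reusing the example groupings from the docstring above; each inner list is one sample's partition of the object ids:

groupings = [
    [[0, 1, 2], [3, 4], [5]],   # sample 0
    [[0, 1], [2, 3, 4, 5]],     # sample 1
]
consensus = find_consensus_grouping(groupings)
group_ids = [row.group_id for row in consensus]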