Example #1
def make_consensus(name, config=None, debug=False):
    '''
    Combine samples into a single consensus sample.
    Arguments:
        name            A unique identifier for consensus
        config          An optional json config file;
                            currently unused, but reserved to support
                            e.g. cluster coarseness in the future
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    loom.config.config_dump(config, paths['samples'][0]['config'])

    LOG('finding consensus')
    loom.consensus.make_consensus(paths=paths, debug=debug)
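A minimal usage sketch, assuming this function is exposed as loom.tasks.make_consensus (Example #8 uses it that way) and that inference has already produced samples under the given name:

import loom.tasks

# 'my_dataset' is a hypothetical store name; config is currently unused.
loom.tasks.make_consensus('my_dataset', debug=False)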
Example #2
def infer_one(name, seed=0, config=None, debug=False):
    '''
    Infer a single sample.
    Arguments:
        name            A unique identifier for ingest + inference
        seed            The seed, i.e., the sample number, typically 0-9
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name, sample_count=(1 + seed))
    sample = paths['samples'][seed]

    LOG('making config')
    if config is None:
        config = {}
    elif isinstance(config, basestring):
        if not os.path.exists(config):
            raise LoomError('Missing config file: {}'.format(config))
        config = json_load(config)
    else:
        config = copy.deepcopy(config)
    if 'seed' not in config:
        config['seed'] = seed
    loom.config.config_dump(config, sample['config'])

    LOG('generating init')
    loom.generate.generate_init(
        encoding_in=paths['ingest']['encoding'],
        model_out=sample['init'],
        seed=seed)

    LOG('shuffling rows')
    loom.runner.shuffle(
        rows_in=paths['ingest']['diffs'],
        rows_out=sample['shuffled'],
        seed=seed,
        debug=debug)

    LOG('inferring, watch {}'.format(sample['infer_log']))
    loom.runner.infer(
        config_in=sample['config'],
        rows_in=sample['shuffled'],
        tares_in=paths['ingest']['tares'],
        model_in=sample['init'],
        model_out=sample['model'],
        groups_out=sample['groups'],
        assign_out=sample['assign'],
        log_out=sample['infer_log'],
        debug=debug)
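A minimal usage sketch, assuming infer_one is exposed on loom.tasks alongside ingest and infer, and that 'my_dataset' has already been ingested (see Example #4); the config value is the one quoted in the docstring:

import os
import loom.tasks

os.environ['LOOM_VERBOSITY'] = '1'  # optional: raise log verbosity

# Infer sample number 3 with 500 extra inference passes.
loom.tasks.infer_one(
    'my_dataset',
    seed=3,
    config={'schedule': {'extra_passes': 500.0}})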
Example #3
def transform(
        name,
        schema_csv='schema.csv',
        rows_csv='rows.csv.gz'):
    '''
    Transform dataset from fluent format to loom internal format.
    Arguments:
        name            A unique identifier for ingest + inference
        schema_csv      Schema file with columns [feature_name, datatype], e.g.
                            Feature Name,Type
                            full name,id
                            start date,optional_date
                            age,real
                            zipcode,unbounded_categorical
                            description,text
                        Loom assumes the first line is a header and ignores it.
                        Features without datatypes are ignored.
        rows_csv        File or directory of csv files or csv.gz files
    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    if not os.path.exists(schema_csv):
        raise LoomError('Missing schema_csv file: {}'.format(schema_csv))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    paths = loom.store.get_paths(name)

    LOG('making transforms')
    id_field = loom.transforms.make_transforms(
        schema_in=schema_csv,
        rows_in=rows_csv,
        schema_out=paths['ingest']['schema'],
        transforms_out=paths['ingest']['transforms'])

    LOG('transforming rows')
    loom.transforms.transform_rows(
        schema_in=paths['ingest']['schema'],
        transforms_in=paths['ingest']['transforms'],
        rows_in=rows_csv,
        rows_out=paths['ingest']['rows_csv'],
        id_field=id_field)
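A minimal usage sketch, assuming transform is exposed on loom.tasks; the schema lines mirror the docstring above:

import loom.tasks

# schema.csv (the first line is a header and is ignored):
#   Feature Name,Type
#   full name,id
#   age,real
#   description,text
loom.tasks.transform(
    'my_dataset',            # hypothetical store name
    schema_csv='schema.csv',
    rows_csv='rows.csv.gz')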
Example #4
def ingest(name, schema=None, rows_csv=None, id_field=None, debug=False):
    '''
    Ingest dataset with optional json config.
    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files or csv.gz files
        id_field        Column name of id field in input csv
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    if schema is None:
        schema = paths['ingest']['schema']
    if rows_csv is None:
        rows_csv = paths['ingest']['rows_csv']
    if not os.path.exists(schema):
        raise LoomError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)

    LOG('making schema row')
    loom.format.make_schema_row(schema_in=schema,
                                schema_row_out=paths['ingest']['schema_row'])

    LOG('making encoding')
    loom.format.make_encoding(schema_in=schema,
                              rows_in=rows_csv,
                              encoding_out=paths['ingest']['encoding'])

    LOG('importing rows')
    loom.format.import_rows(encoding_in=paths['ingest']['encoding'],
                            rows_csv_in=rows_csv,
                            rows_out=paths['ingest']['rows'])

    LOG('importing rowids')
    loom.format.import_rowids(rows_csv_in=rows_csv,
                              rowids_out=paths['ingest']['rowids'],
                              id_field=id_field)

    LOG('making tare rows')
    loom.runner.tare(schema_row_in=paths['ingest']['schema_row'],
                     rows_in=paths['ingest']['rows'],
                     tares_out=paths['ingest']['tares'],
                     debug=debug)

    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(schema_row_in=paths['ingest']['schema_row'],
                         tares_in=paths['ingest']['tares'],
                         rows_in=paths['ingest']['rows'],
                         rows_out=paths['ingest']['diffs'],
                         debug=debug)
    loom.config.config_dump({}, paths['query']['config'])
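A minimal usage sketch, assuming ingest is called after transform (Example #3) so that schema and rows_csv can default to the files transform wrote into the store:

import loom.tasks

# 'my_dataset' is a hypothetical store name; with no schema or rows_csv
# given, the defaults from loom.store.get_paths are used.
loom.tasks.ingest('my_dataset', debug=False)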
Example #5
def make_transforms(schema_in, rows_in, schema_out, transforms_out):
    '''
    Compile a fluent schema into a basic schema plus a pickled list of
    transforms; return the name of the id field if one exists, else None.
    '''
    fluent_schema = load_schema(schema_in)
    basic_schema = {}
    pre_transforms = []
    transforms = []
    builders = []
    dates = [
        feature_name
        for feature_name, fluent_type in fluent_schema.iteritems()
        if fluent_type.endswith('date')
    ]
    id_field = None
    for feature_name, fluent_type in fluent_schema.iteritems():
        # parse adjectives
        if fluent_type.startswith('optional_'):
            transform = PresenceTransform(feature_name)
            pre_transforms.append(transform)
            transforms.append(transform)
            fluent_type = fluent_type[len('optional_'):]
            feature_name = '{}.value'.format(feature_name)

        # parse nouns
        if fluent_type == 'id':
            id_field = feature_name
        elif fluent_type in ['categorical', 'unbounded_categorical']:
            transforms.append(StringTransform(feature_name, fluent_type))
        elif fluent_type == 'percent':
            transforms.append(PercentTransform(feature_name))
        elif fluent_type == 'sparse_real':
            transforms.append(SparseRealTransform(feature_name))
        elif fluent_type == 'text':
            builders.append(TextTransformBuilder(feature_name))
        elif fluent_type == 'tags':
            builders.append(
                TextTransformBuilder(feature_name, allow_empty=True))
        elif fluent_type == 'date':
            relatives = [other for other in dates if other < feature_name]
            transforms.append(DateTransform(feature_name, relatives))
        else:
            basic_type = FLUENT_TO_BASIC[fluent_type]
            basic_schema[feature_name] = basic_type
    if builders:
        transforms += build_transforms(rows_in, pre_transforms, builders)
    for transform in transforms:
        basic_schema.update(transform.get_schema())
    json_dump(basic_schema, schema_out)
    pickle_dump(transforms, transforms_out)
    LOG('transformed {} -> {} features'.format(
        len(fluent_schema),
        len(basic_schema)))
    return id_field
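For intuition, a small sketch of the fluent schema this function consumes; the feature names and types are taken from the docstring of Example #3:

# A fluent schema maps feature names to fluent types.
fluent_schema = {
    'full name': 'id',                   # excluded and returned as id_field
    'start date': 'optional_date',       # 'optional_' adjective + 'date' noun
    'age': 'real',                       # mapped through FLUENT_TO_BASIC
    'zipcode': 'unbounded_categorical',  # handled by StringTransform
    'description': 'text',               # handled by TextTransformBuilder
}
# 'optional_date' yields a PresenceTransform on 'start date' plus a
# DateTransform on 'start date.value', per the adjective parsing above.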
Example #6
def query(name, config=None, debug=False, profile=None):
    '''
    Start the query server.
    Arguments:
        name            A unique identifier for ingest + inference
        config          An optional json config file
        debug           Whether to run debug versions of C++ code
        profile         An optional profile setting, passed through to
                            loom.preql.get_server
    Environment variables:
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    LOG('starting query server')
    server = loom.preql.get_server(paths['root'],
                                   paths['ingest']['encoding'],
                                   config=config,
                                   debug=debug,
                                   profile=profile)
    return server
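A minimal usage sketch; Example #8 shows the returned server being used as a context manager. The PreQL calls available on the server are not shown in this section, so none are assumed here:

import loom.tasks

with loom.tasks.query('my_dataset') as server:  # hypothetical store name
    pass  # issue PreQL queries against `server` here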
Example #7
def crossvalidate_one(seed, test_count, train_count, inputs, results,
                      extra_passes, debug):
    '''
    Run one crossvalidation fold: split rows into train and test sets,
    infer a single sample on the train split, then return the mean score
    of the test rows under that sample.
    '''
    LOG('running seed {}:'.format(seed))
    results['train'] = os.path.join(results['root'], 'train', 'diffs.pbs.gz')
    results['test'] = os.path.join(results['root'], 'test', 'rows.pbs.gz')
    results['scores'] = os.path.join(results['root'], 'scores.json.gz')

    config = {
        'seed': seed,
        'schedule': {
            'extra_passes': extra_passes
        },
    }
    loom.config.config_dump(config, results['samples'][0]['config'])

    numpy.random.seed(seed)
    split = [True] * train_count + [False] * test_count
    numpy.random.shuffle(split)
    diffs_in = protobuf_stream_load(inputs['ingest']['diffs'])
    protobuf_stream_dump((row for s, row in izip(split, diffs_in) if s),
                         results['train'])
    rows_in = protobuf_stream_load(inputs['ingest']['rows'])
    protobuf_stream_dump((row for s, row in izip(split, rows_in) if not s),
                         results['test'])

    LOG(' shuffle')
    loom.runner.shuffle(rows_in=results['train'],
                        rows_out=results['samples'][0]['shuffled'],
                        seed=seed,
                        debug=debug)
    LOG(' init')
    loom.generate.generate_init(encoding_in=inputs['ingest']['encoding'],
                                model_out=results['samples'][0]['init'],
                                seed=seed)
    LOG(' infer')
    loom.runner.infer(config_in=results['samples'][0]['config'],
                      rows_in=results['samples'][0]['shuffled'],
                      tares_in=inputs['ingest']['tares'],
                      model_in=results['samples'][0]['init'],
                      model_out=results['samples'][0]['model'],
                      groups_out=results['samples'][0]['groups'],
                      debug=debug)
    LOG(' query')
    rows = loom.query.load_data_rows(results['test'])
    loom.config.config_dump({}, results['query']['config'])
    with loom.query.get_server(results['root'], debug=debug) as query:
        scores = [query.score(row) for row in rows]

    json_dump(scores, results['scores'])
    LOG(' done\n')
    return numpy.mean(scores)
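A hedged driver sketch; the store names and split sizes are hypothetical, and inputs/results are path dicts of the shape returned by loom.store.get_paths (the sample_count keyword appears in Example #2):

import loom.store

inputs = loom.store.get_paths('my_dataset')
results = loom.store.get_paths('my_dataset_cv', sample_count=1)
# train_count + test_count should match the number of ingested rows.
mean_score = crossvalidate_one(
    seed=0,
    test_count=100,
    train_count=900,
    inputs=inputs,
    results=results,
    extra_passes=8.0,
    debug=False)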
Example #8
def test_all(name, schema, rows_csv, **unused):
    '''
    Smoke-test the full pipeline: ingest, infer, make consensus, then query.
    '''
    name = os.path.join(name, 'test_tasks')
    paths = loom.store.get_paths(name)
    loom.datasets.clean(name)
    loom.tasks.ingest(name, schema, rows_csv, debug=True)
    loom.tasks.infer(
        name,
        sample_count=SAMPLE_COUNT,
        config=CONFIG,
        debug=True)
    loom.tasks.make_consensus(name, debug=True)

    LOG('querying')
    requests = get_example_requests(
        paths['samples'][0]['model'],
        paths['ingest']['rows'])
    with loom.tasks.query(paths['root'], debug=True) as server:
        pbserver = server._query_server.protobuf_server
        for request in requests:
            pbserver.send(request)
            response = pbserver.receive()
            check_response(request, response)
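A hedged invocation sketch; SAMPLE_COUNT, CONFIG, get_example_requests, and check_response are module-level names the test relies on but which are not shown above:

# Exercise the full pipeline against hypothetical schema and rows files.
test_all('my_dataset', 'schema.json', 'rows.csv.gz')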