Example #1
def infer_checkpoint(name=None, period_sec=0, debug=False, profile='time'):
    '''
    Run inference from checkpoint, or list available checkpoints.
    '''
    if name is None:
        list_options_and_exit(CHECKPOINTS)

    rows = ROWS.format(name)
    model = MODEL.format(name)
    checkpoint = CHECKPOINTS.format(name)
    assert os.path.exists(rows), 'First load dataset'
    assert os.path.exists(model), 'First load dataset'
    assert os.path.exists(checkpoint), 'First load checkpoint'

    destin = os.path.join(RESULTS, name)
    mkdir_p(destin)

    config = {'schedule': {'checkpoint_period_sec': period_sec}}
    config_in = os.path.join(destin, 'config.pb.gz')
    loom.config.config_dump(config, config_in)

    kwargs = {'debug': debug, 'profile': profile}
    kwargs.update(checkpoint_files(checkpoint, '_in'))

    loom.runner.infer(config_in=config_in, rows_in=rows, **kwargs)
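
These examples are excerpts from loom's benchmark jig and elide shared
module-level context: imports (os, re, glob, shutil, itertools.izip and the
loom subpackages), path templates such as ROWS, MODEL, INIT, GROUPS,
CHECKPOINTS and RESULTS, and helpers such as mkdir_p, rm_rf, parallel_map and
list_options_and_exit. The checkpoint_files helper is also elided; the sketch
below is a plausible reconstruction from its call sites (a dict of paths keyed
by part name, with an optional '_in'/'_out' key suffix), and the exact file
names in it are assumptions rather than loom's actual layout:

def checkpoint_files(path, suffix=''):
    # Keys are part names plus an optional '_in'/'_out' suffix, so the
    # result can be passed straight to loom.runner.infer as **kwargs.
    path = os.path.abspath(str(path))
    return {
        'model' + suffix: os.path.join(path, 'model.pb.gz'),
        'groups' + suffix: os.path.join(path, 'groups'),
        'assign' + suffix: os.path.join(path, 'assign.pbs.gz'),
        'checkpoint' + suffix: os.path.join(path, 'checkpoint.pb.gz'),
    }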
Example #2
def test(sample_count=2, force=True, debug=False):
    '''
    Generate small synthetic datasets for testing.
    '''
    mkdir_p(loom.store.STORE)
    configs = sorted(TEST_CONFIGS, key=(lambda c: -get_cost(CONFIGS[c])))
    parallel_map(generate_one,
                 [(name, sample_count, force, debug) for name in configs])
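
Sorting by -get_cost(CONFIGS[c]) queues the most expensive configurations
first, which tends to balance work across the parallel_map workers: the
longest jobs start early instead of straggling at the end.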
Example #3
def load_one(name):
    dataset = os.path.join(DATASETS, name)
    mkdir_p(dataset)
    init_out = INIT.format(name)
    rows_out = ROWS.format(name)
    model_out = MODEL.format(name)
    groups_out = GROUPS.format(name)
    if not all(os.path.exists(f) for f in [rows_out, model_out, groups_out]):
        print 'generating', name
        config = CONFIGS[name]
        loom.generate.generate(
            init_out=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            **config)
Example #4
def infer(
        name=None,
        extra_passes=loom.config.DEFAULTS['schedule']['extra_passes'],
        debug=False,
        profile='time'):
    '''
    Run inference on a dataset, or list available datasets.
    '''
    if name is None:
        list_options_and_exit(ROWS)

    init = INIT.format(name)
    rows = ROWS.format(name)
    assert os.path.exists(init), 'First load dataset'
    assert os.path.exists(rows), 'First load dataset'
    assert extra_passes > 0, 'cannot initialize with extra_passes = 0'

    destin = os.path.join(RESULTS, name)
    mkdir_p(destin)
    groups_out = os.path.join(destin, 'groups')
    mkdir_p(groups_out)

    config = {'schedule': {'extra_passes': extra_passes}}
    config_in = os.path.join(destin, 'config.pb.gz')
    loom.config.config_dump(config, config_in)

    loom.runner.infer(
        config_in=config_in,
        rows_in=rows,
        model_in=init,
        groups_out=groups_out,
        debug=debug,
        profile=profile)

    assert os.listdir(groups_out), 'no groups were written'
    group_counts = []
    for f in os.listdir(groups_out):
        group_count = 0
        for _ in protobuf_stream_load(os.path.join(groups_out, f)):
            group_count += 1
        group_counts.append(group_count)
    print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
Example #5
def shuffle(name=None, debug=False, profile='time'):
    '''
    Shuffle dataset for inference.
    '''
    if name is None:
        list_options_and_exit(ROWS)

    rows_in = ROWS.format(name)
    assert os.path.exists(rows_in), 'First load dataset'

    destin = os.path.join(RESULTS, name)
    mkdir_p(destin)
    rows_out = os.path.join(destin, 'rows.pbs.gz')

    loom.runner.shuffle(
        rows_in=rows_in,
        rows_out=rows_out,
        debug=debug,
        profile=profile)
    assert os.path.exists(rows_out)
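
A typical session chains these commands in order: load a dataset, shuffle it,
then run inference. A hypothetical run (the dataset name 'my-dataset' is an
assumption; any loaded dataset works):

    load_one('my-dataset')                 # ingest or generate the dataset
    shuffle('my-dataset')                  # write a shuffled copy of the rows
    infer('my-dataset', extra_passes=2.0)  # run inference on the dataset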
Example #6
def download(s3_url=S3_URL):
    '''
    Download dataset from S3 and load into loom.benchmark jig.
    '''
    import boto
    bucket, path = s3_split(s3_url)
    conn = boto.connect_s3().get_bucket(bucket)
    keys = [
        key.name for key in conn.list(path)
        if re.match(r'.*\d\d\d\.csv\.gz$', key.name)
    ]
    assert keys, 'nothing to download'
    files = [os.path.join(ROWS_CSV, os.path.basename(key)) for key in keys]
    tasks = [(bucket, source, destin) for source, destin in izip(keys, files)
             if not os.path.exists(destin)]
    if tasks:
        print 'starting download of {} files'.format(len(tasks))
        mkdir_p(ROWS_CSV)
        parallel_map(s3_get, tasks)
        print 'finished download of {} files'.format(len(keys))
Example #7
def load_checkpoint(name=None, period_sec=5, debug=False):
    '''
    Grab last full checkpoint for profiling, or list available datasets.
    '''
    loom.store.require(name, ['samples.0.init', 'samples.0.shuffled'])
    inputs, results = get_paths(name, 'checkpoints')

    rm_rf(results['root'])
    mkdir_p(results['root'])
    with chdir(results['root']):

        config = {'schedule': {'checkpoint_period_sec': period_sec}}
        loom.config.config_dump(config, results['samples'][0]['config'])

        # run first iteration
        step = 0
        mkdir_p(str(step))
        kwargs = checkpoint_files(step, '_out')
        print 'running checkpoint {}, tardis_iter 0'.format(step)
        loom.runner.infer(
            config_in=results['samples'][0]['config'],
            rows_in=inputs['samples'][0]['shuffled'],
            tares_in=inputs['ingest']['tares'],
            model_in=inputs['samples'][0]['init'],
            log_out=results['samples'][0]['infer_log'],
            debug=debug,
            **kwargs)
        checkpoint = _load_checkpoint(step)

        # find penultimate checkpoint
        while not checkpoint.finished:
            rm_rf(str(step - 3))
            step += 1
            print 'running checkpoint {}, tardis_iter {}'.format(
                step,
                checkpoint.tardis_iter)
            kwargs = checkpoint_files(step - 1, '_in')
            mkdir_p(str(step))
            kwargs.update(checkpoint_files(step, '_out'))
            loom.runner.infer(
                config_in=results['samples'][0]['config'],
                rows_in=inputs['samples'][0]['shuffled'],
                tares_in=inputs['ingest']['tares'],
                log_out=results['samples'][0]['infer_log'],
                debug=debug,
                **kwargs)
            checkpoint = _load_checkpoint(step)

        print 'final checkpoint {}, tardis_iter {}'.format(
            step,
            checkpoint.tardis_iter)

        last_full = str(step - 2)
        assert os.path.exists(last_full), 'too few checkpoints'
        checkpoint = _load_checkpoint(last_full)
        print 'saving checkpoint {}, tardis_iter {}'.format(
            last_full,
            checkpoint.tardis_iter)
        for f in checkpoint_files(last_full).values():
            shutil.move(f, results['root'])
        for f in glob.glob(os.path.join(results['root'], '[0-9]*/')):
            shutil.rmtree(f)
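
The bookkeeping above keeps a sliding window of checkpoint directories: each
iteration removes the directory from three steps back, and once inference
reports finished, the directory from two steps back is taken as the last full
checkpoint, its files are moved into the results root, and the remaining
numbered directories are deleted.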
Example #8
def generate_one((name, sample_count, force, debug)):
    paths = loom.store.get_paths(name, sample_count=sample_count)
    # skip regeneration when all artifacts already exist and match this
    # loom version
    if not force and all(os.path.exists(f) for f in paths.itervalues()):
        with open_compressed(paths['ingest']['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return
    print 'generating', name
    mkdir_p(paths['root'])
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    config = CONFIGS[name]
    chunk_size = max(10, (config['row_count'] + 7) / 8)
    # synthesize the dataset: fake transforms, then rows, model, groups,
    # and assignments from the named config
    loom.transforms.make_fake_transforms(
        transforms_out=paths['ingest']['transforms'])
    loom.generate.generate(
        init_out=paths['samples'][0]['init'],
        rows_out=paths['ingest']['rows'],
        model_out=paths['samples'][0]['model'],
        groups_out=paths['samples'][0]['groups'],
        assign_out=paths['samples'][0]['assign'],
        **config)
    loom.format.make_schema(
        model_in=paths['samples'][0]['model'],
        schema_out=paths['ingest']['schema'])
    loom.format.make_fake_encoding(
        schema_in=paths['ingest']['schema'],
        model_in=paths['samples'][0]['model'],
        encoding_out=paths['ingest']['encoding'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    # compute tare rows, then sparsify the rows into diffs against them
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_in=paths['ingest']['rows'],
        rows_csv_out=paths['ingest']['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=paths['ingest']['rows_csv'],
        rowids_out=paths['ingest']['rowids'],
        id_field='_id')
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])
    # per-sample artifacts: config, init model, shuffled diffs, empty log
    for seed, sample in enumerate(paths['samples']):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=paths['ingest']['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=paths['ingest']['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])
    # seed the remaining samples from sample 0: copy directly when
    # LOOM_DEBUG_MIX is set, otherwise run the mix kernel
    sample0 = paths['samples'][0]
    for sample in paths['samples'][1:]:
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=paths['ingest']['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(
        paths=paths,
        debug=debug)
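
generate_one takes its arguments as a single tuple (Python 2 tuple-parameter
unpacking) so parallel_map can feed it one task per dataset, as in the test
driver in Example #2; a hypothetical direct call, with an assumed config name,
would be generate_one(('my-config', 2, True, False)).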
Example #9
def load_checkpoint(name=None, period_sec=5, debug=False):
    '''
    Grab last full checkpoint for profiling, or list available datasets.
    '''
    if name is None:
        list_options_and_exit(MODEL)

    rows = ROWS.format(name)
    model = MODEL.format(name)
    assert os.path.exists(model), 'First load dataset'
    assert os.path.exists(rows), 'First load dataset'

    destin = CHECKPOINTS.format(name)
    rm_rf(destin)
    mkdir_p(os.path.dirname(destin))

    def load_checkpoint(name):
        checkpoint = loom.schema_pb2.Checkpoint()
        with open_compressed(checkpoint_files(name)['checkpoint']) as f:
            checkpoint.ParseFromString(f.read())
        return checkpoint

    with tempdir(cleanup_on_error=(not debug)):

        config = {'schedule': {'checkpoint_period_sec': period_sec}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)

        # run first iteration
        step = 0
        mkdir_p(str(step))
        kwargs = checkpoint_files(str(step), '_out')
        print 'running checkpoint {}, tardis_iter 0'.format(step)
        loom.runner.infer(
            config_in=config_in,
            rows_in=rows,
            model_in=model,
            debug=debug,
            **kwargs)
        checkpoint = load_checkpoint(step)

        # find penultimate checkpoint
        while not checkpoint.finished:
            rm_rf(str(step - 3))
            step += 1
            print 'running checkpoint {}, tardis_iter {}'.format(
                step,
                checkpoint.tardis_iter)
            kwargs = checkpoint_files(step - 1, '_in')
            mkdir_p(str(step))
            kwargs.update(checkpoint_files(step, '_out'))
            loom.runner.infer(
                config_in=config_in,
                rows_in=rows,
                debug=debug,
                **kwargs)
            checkpoint = load_checkpoint(step)

        print 'final checkpoint {}, tardis_iter {}'.format(
            step,
            checkpoint.tardis_iter)

        last_full = str(step - 2)
        assert os.path.exists(last_full), 'too few checkpoints'
        checkpoint = load_checkpoint(last_full)
        print 'saving checkpoint {}, tardis_iter {}'.format(
            last_full,
            checkpoint.tardis_iter)
        shutil.move(last_full, destin)
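
Unlike Example #7, which stages its checkpoints under the store's results
tree, this variant moves the saved checkpoint to CHECKPOINTS.format(name),
which is exactly where infer_checkpoint in Example #1 expects to find it.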