Beispiel #1
0
def init(sample_count=1, force=False, debug=False):
    '''
    Generate synthetic datasets for testing and benchmarking.

    Builds one task per name in CONFIGS and hands the whole list to
    parallel_map together with generate_one.  The task list is ordered
    by decreasing get_cost(), i.e. the costliest config comes first.

    Arguments:
        sample_count    Passed through unchanged in every task tuple
        force           Passed through unchanged in every task tuple
        debug           Passed through unchanged in every task tuple
    '''
    def negated_cost(name):
        # Negating the cost makes the ascending sort yield a
        # most-expensive-first ordering.
        return -get_cost(CONFIGS[name])

    ordered_names = sorted(CONFIGS.keys(), key=negated_cost)
    tasks = [(name, sample_count, force, debug) for name in ordered_names]
    parallel_map(generate_one, tasks)
Beispiel #2
0
def init(sample_count=1, force=False, debug=False):
    '''
    Generate synthetic datasets for testing and benchmarking.

    Every key of CONFIGS becomes one (name, sample_count, force, debug)
    task; the tasks are dispatched to generate_one through parallel_map.

    Arguments:
        sample_count    Second element of every task tuple
        force           Third element of every task tuple
        debug           Fourth element of every task tuple
    '''
    names = list(CONFIGS.keys())
    # Sort on the negated cost so the highest-cost config is listed first.
    names.sort(key=lambda name: -get_cost(CONFIGS[name]))
    tasks = [(name, sample_count, force, debug) for name in names]
    parallel_map(generate_one, tasks)
Beispiel #3
0
def test(sample_count=2, force=True, debug=False):
    '''
    Generate small synthetic datasets for testing.

    Ensures loom.store.STORE exists (via mkdir_p), then dispatches one
    generate_one task per name in TEST_CONFIGS through parallel_map,
    listing the names by decreasing get_cost() of their CONFIGS entry.

    Arguments:
        sample_count    Passed through unchanged in every task tuple
        force           Passed through unchanged in every task tuple
        debug           Passed through unchanged in every task tuple
    '''
    mkdir_p(loom.store.STORE)

    def negated_cost(name):
        # Negating the cost makes the ascending sort yield a
        # most-expensive-first ordering.
        return -get_cost(CONFIGS[name])

    ordered_names = sorted(TEST_CONFIGS, key=negated_cost)
    tasks = [(name, sample_count, force, debug) for name in ordered_names]
    parallel_map(generate_one, tasks)
Beispiel #4
0
def test(sample_count=2, force=True, debug=False):
    '''
    Generate small synthetic datasets for testing.

    The store directory loom.store.STORE is created first; afterwards
    each name in TEST_CONFIGS yields one task for generate_one, with the
    costliest config (according to get_cost) at the head of the list.

    Arguments:
        sample_count    Second element of every task tuple
        force           Third element of every task tuple
        debug           Fourth element of every task tuple
    '''
    mkdir_p(loom.store.STORE)
    names = list(TEST_CONFIGS)
    # Sort on the negated cost so the highest-cost config is listed first.
    names.sort(key=lambda name: -get_cost(CONFIGS[name]))
    tasks = [(name, sample_count, force, debug) for name in names]
    parallel_map(generate_one, tasks)
Beispiel #5
0
def infer(name, sample_count=10, config=None, debug=False):
    '''
    Infer samples in parallel.

    One _infer_one task is created per seed in 0..sample_count-1 and the
    full task list is handed to parallel_map.

    Arguments:
        name            A unique identifier for ingest + inference
        sample_count    The number of samples to draw, typically 10-100
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent inference tasks
        LOOM_VERBOSITY  Verbosity level
    Raises:
        AssertionError  if sample_count is less than 1
    '''
    enough_samples = sample_count >= 1
    assert enough_samples, 'too few samples: {}'.format(sample_count)
    # Each sample gets its own seed so the tasks differ from one another.
    tasks = [(name, seed, config, debug) for seed in xrange(sample_count)]
    parallel_map(_infer_one, tasks)
Beispiel #6
0
def infer(name,
          sample_count=DEFAULTS['sample_count'],
          config=None,
          debug=False):
    '''
    Infer samples in parallel.

    One _infer_one task is created per seed in 0..sample_count-1 and the
    full task list is handed to parallel_map.

    Arguments:
        name            A unique identifier for ingest + inference
        sample_count    The number of samples to draw, typically 10-100
        config          An optional json config file, e.g.,
                            {"schedule": {"extra_passes": 500.0}}
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent inference tasks
        LOOM_VERBOSITY  Verbosity level
    Raises:
        LoomError       if sample_count is not at least 1
    '''
    enough_samples = sample_count >= 1
    if not enough_samples:
        raise LoomError('Too few samples: {}'.format(sample_count))
    # Each sample gets its own seed so the tasks differ from one another.
    tasks = [(name, seed, config, debug) for seed in xrange(sample_count)]
    parallel_map(_infer_one, tasks)
Beispiel #7
0
def download(s3_url=S3_URL):
    '''
    Download dataset from S3 and load into loom.benchmark jig.

    Lists the keys under s3_url whose names end in three digits followed
    by '.csv.gz' and fetches, via s3_get, each one that does not already
    exist under ROWS_CSV.  Files already present locally are skipped, and
    nothing is printed or created when there is nothing left to fetch.

    Arguments:
        s3_url          Location of the dataset; split into a bucket
                        name and a path by s3_split
    Raises:
        AssertionError  if no key under s3_url matches the pattern
    '''
    import boto
    bucket, path = s3_split(s3_url)
    s3_bucket = boto.connect_s3().get_bucket(bucket)
    keys = [
        key.name for key in s3_bucket.list(path)
        if re.match(r'.*\d\d\d\.csv\.gz$', key.name)
    ]
    assert keys, 'nothing to download'
    tasks = []
    for source in keys:
        destin = os.path.join(ROWS_CSV, os.path.basename(source))
        if not os.path.exists(destin):
            tasks.append((bucket, source, destin))
    if tasks:
        print('starting download of {} files'.format(len(tasks)))
        mkdir_p(ROWS_CSV)
        parallel_map(s3_get, tasks)
        # Report the number of files actually fetched so this agrees with
        # the 'starting' message; len(keys) would also count files that
        # were already present and therefore skipped.
        print('finished download of {} files'.format(len(tasks)))
Beispiel #8
0
def transform_rows(schema_in, transforms_in, rows_in, rows_out, id_field=None):
    '''
    Apply the pickled transforms to rows_in, writing to rows_out.

    When transforms_in unpickles to something empty, rows_in and rows_out
    are simply handed to cp_ns.  Otherwise a TransformSequence is built
    and _transform_rows is run through parallel_map: once per directory
    entry when rows_in is a directory, else once for the single path.

    Arguments:
        schema_in       Loaded with json_load; its sorted keys form the
                        transformed header
        transforms_in   Loaded with pickle_load
        rows_in         Input file, or directory of input files
        rows_out        Output file, or output directory (created when
                        rows_in is a directory)
        id_field        Optional name prepended to the header; must not
                        already be one of the schema keys
    '''
    transforms = pickle_load(transforms_in)
    if not transforms:
        cp_ns(rows_in, rows_out)
        return

    transform = TransformSequence(transforms)
    header = sorted(json_load(schema_in).iterkeys())
    if id_field is not None:
        assert id_field not in header
        header.insert(0, id_field)

    if os.path.isdir(rows_in):
        loom.util.mkdir_p(rows_out)
        # Mirror every entry of the input directory into the output one.
        tasks = [
            (
                transform,
                header,
                os.path.join(rows_in, filename),
                os.path.join(rows_out, filename),
            )
            for filename in os.listdir(rows_in)
        ]
    else:
        tasks = [(transform, header, rows_in, rows_out)]
    parallel_map(_transform_rows, tasks)
Beispiel #9
0
def transform_rows(schema_in, transforms_in, rows_in, rows_out, id_field=None):
    '''
    Run the transforms stored in transforms_in over rows_in.

    If there are transforms, a TransformSequence is built from them and
    one _transform_rows task is dispatched through parallel_map for each
    (input, output) path pair: every entry of rows_in when it is a
    directory, or rows_in itself otherwise.  If there are none, rows_in
    and rows_out are passed to cp_ns instead.

    Arguments:
        schema_in       Loaded with json_load; its sorted keys form the
                        transformed header
        transforms_in   Loaded with pickle_load
        rows_in         Input file, or directory of input files
        rows_out        Output file, or output directory (created when
                        rows_in is a directory)
        id_field        Optional name placed at the front of the header;
                        must not already be one of the schema keys
    '''
    transforms = pickle_load(transforms_in)
    if transforms:
        transform = TransformSequence(transforms)
        field_names = sorted(json_load(schema_in).iterkeys())
        if id_field is not None:
            assert id_field not in field_names
            field_names = [id_field] + field_names

        # Collect the (input, output) path pairs first, then attach the
        # shared transform and header to each of them.
        if os.path.isdir(rows_in):
            loom.util.mkdir_p(rows_out)
            path_pairs = []
            for entry in os.listdir(rows_in):
                source = os.path.join(rows_in, entry)
                destin = os.path.join(rows_out, entry)
                path_pairs.append((source, destin))
        else:
            path_pairs = [(rows_in, rows_out)]
        tasks = [
            (transform, field_names, source, destin)
            for source, destin in path_pairs
        ]
        parallel_map(_transform_rows, tasks)
    else:
        cp_ns(rows_in, rows_out)
Beispiel #10
0
def download(s3_url=S3_URL):
    '''
    Download dataset from S3 and load into loom.benchmark jig.

    Keys under s3_url are kept when their name ends in three digits
    followed by '.csv.gz'.  Each kept key whose basename does not yet
    exist under ROWS_CSV is fetched with s3_get through parallel_map;
    keys that already have a local file are skipped.

    Arguments:
        s3_url          Location of the dataset; split into a bucket
                        name and a path by s3_split
    Raises:
        AssertionError  if no key under s3_url matches the pattern
    '''
    import boto
    bucket, path = s3_split(s3_url)
    s3_bucket = boto.connect_s3().get_bucket(bucket)
    keys = [
        key.name
        for key in s3_bucket.list(path)
        if re.match(r'.*\d\d\d\.csv\.gz$', key.name)
    ]
    assert keys, 'nothing to download'
    targets = [
        (source, os.path.join(ROWS_CSV, os.path.basename(source)))
        for source in keys
    ]
    tasks = [
        (bucket, source, destin)
        for source, destin in targets
        if not os.path.exists(destin)
    ]
    if tasks:
        task_count = len(tasks)
        print('starting download of {} files'.format(task_count))
        mkdir_p(ROWS_CSV)
        parallel_map(s3_get, tasks)
        # Use the same count as the 'starting' message: only the files
        # fetched in this run, not every matching key (some of which may
        # have been skipped because they already existed locally).
        print('finished download of {} files'.format(task_count))
Beispiel #11
0
def init():
    '''
    Generate synthetic datasets for testing and benchmarking.

    Runs load_one through parallel_map over the names in CONFIGS,
    listed by decreasing get_cost() so the costliest config comes first.
    '''
    def negated_cost(name):
        # Negating the cost makes the ascending sort yield a
        # most-expensive-first ordering.
        return -get_cost(CONFIGS[name])

    ordered_names = sorted(CONFIGS.keys(), key=negated_cost)
    parallel_map(load_one, ordered_names)