Example #1
def pretty_print(filename, message_type='guess'):
    '''
    Print text/json/protobuf messages from a raw/gz/bz2 file.
    '''
    parts = os.path.basename(filename).split('.')
    if parts[-1] in ['gz', 'bz2']:
        parts.pop()
    protocol = parts[-1]
    if protocol == 'json':
        data = json_load(filename)
        print json.dumps(data, sort_keys=True, indent=4)
    elif protocol == 'pb':
        message = get_message(filename, message_type)
        with open_compressed(filename) as f:
            message.ParseFromString(f.read())
            print message
    elif protocol == 'pbs':
        message = get_message(filename, message_type)
        for string in protobuf_stream_load(filename):
            message.ParseFromString(string)
            print message
    elif protocol == 'pickle':
        data = pickle_load(filename)
        print repr(data)
    else:
        with open_compressed(filename) as f:
            for line in f:
                print line,
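All of these examples depend on an open_compressed helper that the listing never shows. A minimal sketch of a comparable opener, assuming dispatch on the .gz/.bz2 filename suffix (the body below is an assumption, not loom's actual implementation):

import bz2
import gzip

def open_compressed(filename, mode='r'):
    # Hypothetical stand-in for the open_compressed utility used throughout:
    # choose gzip, bz2, or plain file I/O based on the filename extension.
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    elif filename.endswith('.bz2'):
        return bz2.BZ2File(filename, mode)
    return open(filename, mode)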
Example #2
def import_rows():
    row = Row()
    pos = row.diff.pos
    neg = row.diff.neg
    pos.observed.sparsity = ProductValue.Observed.SPARSE
    neg.observed.sparsity = ProductValue.Observed.SPARSE
    with open_compressed(RAW) as infile:
        doc_count = int(infile.next())
        word_count = int(infile.next())
        observed_count = int(infile.next())
        print 'Importing {} observations of {} words in {} documents'.format(
            observed_count, word_count, doc_count)
        with open_compressed(DIFFS, 'wb') as outfile:
            current_doc = None
            for line in infile:
                doc, feature, count = line.split()
                if doc != current_doc:
                    if current_doc is not None:
                        pos.observed.sparse.sort()
                        neg.observed.sparse.sort()
                        protobuf_stream_write(row.SerializeToString(), outfile)
                        print_dot(every=1000)
                    current_doc = doc
                    row.id = int(doc)
                    del pos.booleans[:]
                    del pos.observed.sparse[:]
                    del neg.booleans[:]
                    del neg.observed.sparse[:]
                feature = int(feature) - 1
                pos.observed.sparse.append(feature)
                pos.booleans.append(True)
                neg.observed.sparse.append(feature)
                neg.booleans.append(False)
            protobuf_stream_write(row.SerializeToString(), outfile)
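Judging from the reads above, RAW is a text file whose first three lines give the document, word, and observation counts, followed by one whitespace-separated doc/feature/count triple per line. An illustrative, made-up fragment:

# Assumed RAW layout (values invented for illustration):
# 2        doc_count
# 5        word_count
# 4        observed_count
# 1 2 1    doc feature count
# 1 4 3
# 2 1 2
# 2 5 1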
Example #3
def test_predict(root, rows_csv, encoding, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.preql.get_server(root, debug=True) as preql:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            preql.predict(rows_in, COUNT, result_out, id_offset=True)
            print 'DEBUG', open_compressed(rows_in).read()
            print 'DEBUG', open_compressed(result_out).read()
            _check_predictions(rows_in, result_out, encoding)
Example #4
def force_ascii(filename_in, filename_out=None, size=4096):
    with ExitStack() as stack:
        with_ = stack.enter_context
        if filename_out is None:
            filename_out = with_(loom.util.temp_copy(filename_in))
        source = with_(open_compressed(filename_in, 'rb'))
        destin = with_(open_compressed(filename_out, 'w'))
        chunk = source.read(size)
        while chunk:
            destin.write(re_nonascii.sub('', chunk))
            chunk = source.read(size)
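re_nonascii is defined outside this snippet; a plausible definition, stripping every byte outside the 7-bit ASCII range, would be:

import re

# Assumed pattern: force_ascii removes anything that is not 7-bit ASCII.
re_nonascii = re.compile(r'[^\x00-\x7f]')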
Example #5
def config_dump(config, filename):
    config = deepcopy(config)
    fill_in_defaults(config)
    message = loom.schema_pb2.Config()
    protobuf_dump(config, message)
    with open_compressed(filename, 'wb') as f:
        f.write(message.SerializeToString())
Example #6
def test_predict(root, rows_csv, encoding, **unused):
    COUNT = 10
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.query.get_server(root, debug=True) as query_server:
            result_out = 'predictions_out.csv'
            rows_in = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, rows_in)
            encoders = json_load(encoding)
            name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
            preql = loom.preql.PreQL(query_server, encoding)
            preql.predict(rows_in, COUNT, result_out, id_offset=False)
            with open_compressed(rows_in, 'rb') as fin:
                with open(result_out, 'r') as fout:
                    in_reader = csv.reader(fin)
                    out_reader = csv.reader(fout)
                    fnames = in_reader.next()
                    out_reader.next()
                    for in_row in in_reader:
                        for i in range(COUNT):
                            out_row = out_reader.next()
                            bundle = zip(fnames, in_row, out_row)
                            for name, in_val, out_val in bundle:
                                encode = name_to_encoder[name]
                                observed = bool(in_val.strip())
                                if observed:
                                    assert_almost_equal(
                                        encode(in_val),
                                        encode(out_val))
                                else:
                                    assert_true(bool(out_val.strip()))
Example #7
def _check_predictions(rows_in, result_out, encoding):
    encoders = json_load(encoding)
    name_to_encoder = {e['name']: load_encoder(e) for e in encoders}
    with open_compressed(rows_in, 'rb') as fin:
        with open(result_out, 'r') as fout:
            in_reader = csv.reader(fin)
            out_reader = csv.reader(fout)
            fnames = in_reader.next()
            out_reader.next()
            for in_row in in_reader:
                for i in range(COUNT):
                    out_row = out_reader.next()
                    bundle = zip(fnames, in_row, out_row)
                    for name, in_val, out_val in bundle:
                        if name == '_id':
                            assert_equal(in_val, out_val)
                            continue
                        encode = name_to_encoder[name]
                        observed = bool(in_val.strip())
                        if observed:
                            assert_almost_equal(
                                encode(in_val),
                                encode(out_val))
                        else:
                            assert_true(bool(out_val.strip()))
Example #8
def get_example_requests(model):
    cross_cat = CrossCat()
    with open_compressed(model) as f:
        cross_cat.ParseFromString(f.read())
    feature_count = sum(len(kind.featureids) for kind in cross_cat.kinds)

    all_observed = [True] * feature_count
    none_observed = [False] * feature_count
    observeds = []
    observeds.append(all_observed)
    for f in xrange(feature_count):
        observed = all_observed[:]
        observed[f] = False
        observeds.append(observed)
    for f in xrange(feature_count):
        observed = [sample_bernoulli(0.5) for _ in xrange(feature_count)]
        observeds.append(observed)
    for f in xrange(feature_count):
        observed = none_observed[:]
        observed[f] = True
        observeds.append(observed)
    observeds.append(none_observed)

    requests = []
    for i, observed in enumerate(observeds):
        request = Query.Request()
        request.id = "example-{}".format(i)
        request.sample.data.observed[:] = none_observed
        request.sample.to_sample[:] = observed
        request.sample.sample_count = 1
        requests.append(request)

    return requests
Example #9
def _import_rows_file(args):
    encoding_in, rows_csv_in, rows_out, id_offset, id_stride = args
    assert os.path.isfile(rows_csv_in)
    encoders = json_load(encoding_in)
    message = loom.cFormat.Row()
    add_field = {
        'booleans': message.add_booleans,
        'counts': message.add_counts,
        'reals': message.add_reals,
    }
    with open_compressed(rows_csv_in, 'rb') as f:
        reader = csv.reader(f)
        feature_names = list(reader.next())
        name_to_pos = {name: i for i, name in enumerate(feature_names)}
        schema = []
        for encoder in encoders:
            pos = name_to_pos.get(encoder['name'])
            add = add_field[loom.schema.MODEL_TO_DATATYPE[encoder['model']]]
            encode = load_encoder(encoder)
            schema.append((pos, add, encode))

        def rows():
            for i, row in enumerate(reader):
                message.id = id_offset + id_stride * i
                for pos, add, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    message.add_observed(observed)
                    if observed:
                        add(encode(value))
                yield message
                message.Clear()

        loom.cFormat.row_stream_dump(rows(), rows_out)
Example #10
def make_fake_encoding(schema_in, model_in, encoding_out):
    '''
    Make a fake encoding from json schema + model.
    Assume that feature names in the schema correspond to featureids in the
    model, e.g. the schema was generated by loom.format.make_schema.
    '''
    schema = json_load(schema_in)
    fields = []
    builders = []
    name_to_builder = {}
    for name, model in sorted(schema.iteritems()):
        fields.append(loom.schema.MODEL_TO_DATATYPE[model])
        Builder = FAKE_ENCODER_BUILDERS[model]
        builder = Builder(name, model)
        builders.append(builder)
        name_to_builder[name] = builder

    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    for kind in cross_cat.kinds:
        featureid = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                feature_name = '{:06d}'.format(featureid.next())
                assert feature_name in schema
                if model == 'dd':
                    for i in range(len(shared.alphas)):
                        name_to_builder[feature_name].add_value(str(i))
                elif model == 'dpd':
                    for val in shared.values:
                        name_to_builder[feature_name].add_value(str(val))
    encoders = [b.build() for b in builders]
    ensure_fake_encoders_are_sorted(encoders)
    json_dump(encoders, encoding_out)
Example #11
def ingest(name, schema=None, rows_csv=None, id_field=None, debug=False):
    '''
    Ingest dataset with optional json config.
    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files or csv.gz files
        id_field        Column name of id field in input csv
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    paths = loom.store.get_paths(name)
    if schema is None:
        schema = paths['ingest']['schema']
    if rows_csv is None:
        rows_csv = paths['ingest']['rows_csv']
    if not os.path.exists(schema):
        raise LoomError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise LoomError('Missing rows_csv file: {}'.format(rows_csv))

    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)

    LOG('making schema row')
    loom.format.make_schema_row(schema_in=schema,
                                schema_row_out=paths['ingest']['schema_row'])

    LOG('making encoding')
    loom.format.make_encoding(schema_in=schema,
                              rows_in=rows_csv,
                              encoding_out=paths['ingest']['encoding'])

    LOG('importing rows')
    loom.format.import_rows(encoding_in=paths['ingest']['encoding'],
                            rows_csv_in=rows_csv,
                            rows_out=paths['ingest']['rows'])

    LOG('importing rowids')
    loom.format.import_rowids(rows_csv_in=rows_csv,
                              rowids_out=paths['ingest']['rowids'],
                              id_field=id_field)

    LOG('making tare rows')
    loom.runner.tare(schema_row_in=paths['ingest']['schema_row'],
                     rows_in=paths['ingest']['rows'],
                     tares_out=paths['ingest']['tares'],
                     debug=debug)

    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(schema_row_in=paths['ingest']['schema_row'],
                         tares_in=paths['ingest']['tares'],
                         rows_in=paths['ingest']['rows'],
                         rows_out=paths['ingest']['diffs'],
                         debug=debug)
    loom.config.config_dump({}, paths['query']['config'])
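A hypothetical invocation of ingest, assuming a schema file in the json format shown in the docstring (the dataset name, file names, and feature models below are made up):

# schema.json might contain: {"x": "nich", "y": "nich", "label": "dd"}
ingest(name='toy', schema='schema.json', rows_csv='rows.csv.gz', id_field='_id')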
Example #12
def _loom_cross_cat(path, sample):
    """Return the loom CrossCat structure at `path`, whose id is `sample`."""
    model_in = os.path.join(path, 'samples', 'sample.%d' % (sample, ),
                            'model.pb.gz')
    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    return cross_cat
Example #13
 def predict(self, rows_csv, count, result_out, id_offset=True):
     with open_compressed(rows_csv, 'rb') as fin:
         with open_compressed(result_out, 'w') as fout:
             reader = csv.reader(fin)
             writer = csv.writer(fout)
             feature_names = list(reader.next())
             writer.writerow(feature_names)
             name_to_pos = {name: i for i, name in enumerate(feature_names)}
             pos_to_decode = {}
             schema = []
             for encoder in self.encoders:
                 pos = name_to_pos.get(encoder['name'])
                 encode = load_encoder(encoder)
                 decode = load_decoder(encoder)
                 if pos is not None:
                     pos_to_decode[pos] = decode
                 schema.append((pos, encode))
             for row in reader:
                 conditioning_row = []
                 to_sample = []
                 if id_offset:
                     row_id = row.pop(0)
                 for pos, encode, in schema:
                     value = None if pos is None else row[pos].strip()
                     observed = bool(value)
                     to_sample.append((not observed))
                     if observed is False:
                         conditioning_row.append(None)
                     else:
                         conditioning_row.append(encode(value))
                 samples = self.query_server.sample(
                     to_sample,
                     conditioning_row,
                     count)
                 for sample in samples:
                     if id_offset:
                         out_row = [row_id]
                     else:
                         out_row = []
                     for name in feature_names:
                         pos = name_to_pos[name]
                         decode = pos_to_decode[pos]
                         val = sample[pos]
                         out_row.append(val)
                     writer.writerow(out_row)
Example #14
def csv_output(arg):
    if arg is None:
        outfile = StringIO()
        yield CsvWriter(outfile, returns=outfile.getvalue)
    elif hasattr(arg, 'write'):
        yield CsvWriter(arg)
    else:
        with open_compressed(arg, 'w') as outfile:
            yield CsvWriter(outfile)
Example #15
 def _get_cross_cat(self, bdb, generator_id, modelno):
     """Return the loom CrossCat structure whose id is `modelno`."""
     model_in = os.path.join(
         self._get_loom_project_path(bdb, generator_id),
         'samples', 'sample.%d' % (modelno,), 'model.pb.gz')
     cross_cat = loom.schema_pb2.CrossCat()
     with open_compressed(model_in, 'rb') as f:
         cross_cat.ParseFromString(f.read())
     return cross_cat
Example #16
 def _get_cross_cat(self, bdb, generator_id, modelno):
     """Return the loom CrossCat structure whose id is `modelno`."""
     model_in = os.path.join(self._get_loom_project_path(bdb, generator_id),
                             'samples', 'sample.%d' % (modelno, ),
                             'model.pb.gz')
     cross_cat = loom.schema_pb2.CrossCat()
     with open_compressed(model_in, 'rb') as f:
         cross_cat.ParseFromString(f.read())
     return cross_cat
Example #17
def group_sample((sample, featureid)):
    model = CrossCat()
    with open_compressed(sample['model']) as f:
        model.ParseFromString(f.read())
    for kindid, kind in enumerate(model.kinds):
        if featureid in kind.featureids:
            break
    assignments = assignment_stream_load(sample['assign'])
    return collate((a.groupids(kindid), a.rowid) for a in assignments)
Example #18
def generate_init(encoding_in, model_out, seed=0):
    '''
    Generate an initial model for inference.
    '''
    numpy.random.seed(seed)
    encoders = json_load(encoding_in)
    features = import_features(encoders)
    cross_cat = generate_model(features)
    with open_compressed(model_out, 'wb') as f:
        f.write(cross_cat.SerializeToString())
Example #19
def ingest(name, schema='schema.json', rows_csv='rows.csv.gz', debug=False):
    '''
    Ingest dataset with optional json config.
    Arguments:
        name            A unique identifier for ingest + inference
        schema          Json schema file, e.g., {"feature1": "nich"}
        rows_csv        File or directory of csv files
        debug           Whether to run debug versions of C++ code
    Environment variables:
        LOOM_THREADS    Number of concurrent ingest tasks
        LOOM_VERBOSITY  Verbosity level
    '''
    if not os.path.exists(schema):
        raise IOError('Missing schema file: {}'.format(schema))
    if not os.path.exists(rows_csv):
        raise IOError('Missing rows_csv file: {}'.format(rows_csv))

    paths = loom.store.get_paths(name)
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)

    LOG('making schema row')
    loom.format.make_schema_row(
        schema_in=schema,
        schema_row_out=paths['ingest']['schema_row'])

    LOG('making encoding')
    loom.format.make_encoding(
        schema_in=schema,
        rows_in=rows_csv,
        encoding_out=paths['ingest']['encoding'])

    LOG('importing rows')
    loom.format.import_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_csv_in=rows_csv,
        rows_out=paths['ingest']['rows'])

    LOG('making tare rows')
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)

    tare_count = sum(1 for _ in protobuf_stream_load(paths['ingest']['tares']))
    LOG('sparsifying rows WRT {} tare rows'.format(tare_count))
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
Example #20
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        field = loom.schema.MODEL_TO_DATATYPE[model]
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())
Example #21
def test_infer(name, tares, shuffled, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)

        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)

            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0

            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True,)

                if kind_structure_is_fixed:
                    assert_equal(len(os.listdir(groups_out)), kind_count)

                group_counts = get_group_counts(groups_out)

                assign_count = sum(1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)

            print 'row_count: {}'.format(row_count)
            print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
            for group_count in group_counts:
                assert_true(
                    group_count <= row_count,
                    'groups are all singletons')
Example #22
def test_infer(name, tares, shuffled, init, **unused):
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        row_count = sum(1 for _ in protobuf_stream_load(shuffled))
        with open_compressed(init) as f:
            message = CrossCat()
            message.ParseFromString(f.read())
        kind_count = len(message.kinds)

        for config in CONFIGS:
            loom.config.fill_in_defaults(config)
            schedule = config['schedule']
            print 'config: {}'.format(config)

            greedy = (schedule['extra_passes'] == 0)
            kind_iters = config['kernels']['kind']['iterations']
            kind_structure_is_fixed = greedy or kind_iters == 0

            with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
                config_in = os.path.abspath('config.pb.gz')
                model_out = os.path.abspath('model.pb.gz')
                groups_out = os.path.abspath('groups')
                assign_out = os.path.abspath('assign.pbs.gz')
                log_out = os.path.abspath('log.pbs.gz')
                os.mkdir(groups_out)
                loom.config.config_dump(config, config_in)
                loom.runner.infer(
                    config_in=config_in,
                    rows_in=shuffled,
                    tares_in=tares,
                    model_in=init,
                    model_out=model_out,
                    groups_out=groups_out,
                    assign_out=assign_out,
                    log_out=log_out,
                    debug=True,
                )

                if kind_structure_is_fixed:
                    assert_equal(len(os.listdir(groups_out)), kind_count)

                group_counts = get_group_counts(groups_out)

                assign_count = sum(1 for _ in protobuf_stream_load(assign_out))
                assert_equal(assign_count, row_count)

            print 'row_count: {}'.format(row_count)
            print 'group_counts: {}'.format(' '.join(map(str, group_counts)))
            for group_count in group_counts:
                assert_true(group_count <= row_count,
                            'groups are all singletons')
Example #23
File: main.py Project: fritzo/loom
def create_dataset(row_count=ROW_COUNT):
    '''
    Extract dataset from image.
    '''
    scipy.misc.imsave(os.path.join(RESULTS, 'original.png'), IMAGE)
    print 'sampling {} points from image'.format(row_count)
    with open_compressed(SAMPLES, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y'])
        for row in sample_from_image(IMAGE, row_count):
            writer.writerow(row)
    with csv_reader(SAMPLES) as reader:
        reader.next()
        image = visualize_dataset(map(float, row) for row in reader)
    scipy.misc.imsave(os.path.join(RESULTS, 'samples.png'), image)
Example #24
def import_rows():
    row = Row()
    pos = row.diff.pos
    neg = row.diff.neg
    pos.observed.sparsity = ProductValue.Observed.SPARSE
    neg.observed.sparsity = ProductValue.Observed.SPARSE
    with open_compressed(RAW) as infile:
        doc_count = int(infile.next())
        word_count = int(infile.next())
        observed_count = int(infile.next())
        print 'Importing {} observations of {} words in {} documents'.format(
            observed_count,
            word_count,
            doc_count)
        with open_compressed(DIFFS, 'wb') as outfile:
            current_doc = None
            for line in infile:
                doc, feature, count = line.split()
                if doc != current_doc:
                    if current_doc is not None:
                        pos.observed.sparse.sort()
                        neg.observed.sparse.sort()
                        protobuf_stream_write(row.SerializeToString(), outfile)
                        print_dot(every=1000)
                    current_doc = doc
                    row.id = int(doc)
                    del pos.booleans[:]
                    del pos.observed.sparse[:]
                    del neg.booleans[:]
                    del neg.observed.sparse[:]
                feature = int(feature) - 1
                pos.observed.sparse.append(feature)
                pos.booleans.append(True)
                neg.observed.sparse.append(feature)
                neg.booleans.append(False)
            protobuf_stream_write(row.SerializeToString(), outfile)
Example #25
def generate(
        feature_type='mixed',
        row_count=1000,
        feature_count=100,
        density=0.5,
        rows_out='rows.pbs.gz',
        model_out='model.pb.gz',
        groups_out=None,
        assign_out=None,
        init_out=None,
        debug=False,
        profile=None):
    '''
    Generate a synthetic dataset.
    '''
    root = os.getcwd()
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    if groups_out is not None:
        groups_out = os.path.abspath(groups_out)
    if assign_out is not None:
        assign_out = os.path.abspath(assign_out)
    if init_out is not None:
        init_out = os.path.abspath(init_out)

    features = generate_features(feature_count, feature_type)
    model = generate_model(features)

    with tempdir(cleanup_on_error=(not debug)):
        if init_out is None:
            init_out = os.path.abspath('init.pb.gz')
        with open_compressed(init_out, 'wb') as f:
            f.write(model.SerializeToString())

        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)

        os.chdir(root)
        loom.runner.generate(
            config_in=config_in,
            model_in=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            assign_out=assign_out,
            debug=debug,
            profile=profile)
Example #26
def make_schema(model_in, schema_out):
    '''
    Make a schema from a protobuf model.
    '''
    cross_cat = loom.schema_pb2.CrossCat()
    with open_compressed(model_in, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    schema = {}
    for kind in cross_cat.kinds:
        featureid = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                feature_name = '{:06d}'.format(featureid.next())
                schema[feature_name] = model
    json_dump(schema, schema_out)
    return schema
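The returned schema maps zero-padded feature ids to model names, so the json written to schema_out looks roughly like this (ids and models are illustrative):

# {"000000": "nich", "000001": "dd", "000002": "dpd"}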
Example #27
def generate(feature_type='mixed',
             row_count=1000,
             feature_count=100,
             density=0.5,
             rows_out='rows.pbs.gz',
             model_out='model.pb.gz',
             groups_out=None,
             assign_out=None,
             init_out=None,
             debug=False,
             profile=None):
    '''
    Generate a synthetic dataset.
    '''
    root = os.getcwd()
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    if groups_out is not None:
        groups_out = os.path.abspath(groups_out)
    if assign_out is not None:
        assign_out = os.path.abspath(assign_out)
    if init_out is not None:
        init_out = os.path.abspath(init_out)

    features = generate_features(feature_count, feature_type)
    model = generate_model(features)

    with tempdir(cleanup_on_error=(not debug)):
        if init_out is None:
            init_out = os.path.abspath('init.pb.gz')
        with open_compressed(init_out, 'wb') as f:
            f.write(model.SerializeToString())

        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)

        os.chdir(root)
        loom.runner.generate(config_in=config_in,
                             model_in=init_out,
                             rows_out=rows_out,
                             model_out=model_out,
                             groups_out=groups_out,
                             assign_out=assign_out,
                             debug=debug,
                             profile=profile)
Example #28
def make_schema_row(schema_in, schema_row_out):
    '''
    Convert json schema to protobuf schema row.
    '''
    schema = json_load(schema_in)
    if not schema:
        raise LoomError('Schema is empty: {}'.format(schema_in))
    value = loom.schema_pb2.ProductValue()
    value.observed.sparsity = loom.schema_pb2.ProductValue.Observed.DENSE
    for model in schema.itervalues():
        try:
            field = loom.schema.MODEL_TO_DATATYPE[model]
        except KeyError:
            raise LoomError('Unknown model {} in schema {}'.format(
                model, schema_in))
        value.observed.dense.append(True)
        getattr(value, field).append(EXAMPLE_VALUES[field])
    with open_compressed(schema_row_out, 'wb') as f:
        f.write(value.SerializeToString())
Example #29
def test_predict_pandas(root, rows_csv, schema, **unused):
    feature_count = len(json_load(schema))
    with loom.preql.get_server(root, debug=True) as preql:
        rows_filename = os.path.join(rows_csv, os.listdir(rows_csv)[0])
        with open_compressed(rows_filename) as f:
            rows_df = pandas.read_csv(f,
                                      converters=preql.converters,
                                      index_col='_id')
        print 'rows_df ='
        print rows_df
        row_count = rows_df.shape[0]
        assert_equal(rows_df.shape[1], feature_count)
        rows_io = StringIO(rows_df.to_csv())
        result_string = preql.predict(rows_io, COUNT, id_offset=True)
        result_df = pandas.read_csv(StringIO(result_string), index_col=False)
        print 'result_df ='
        print result_df
        assert_equal(result_df.ndim, 2)
        assert_equal(result_df.shape[0], row_count * COUNT)
        assert_equal(result_df.shape[1], 1 + feature_count)
Example #30
def test_predict_pandas(root, rows_csv, schema, **unused):
    feature_count = len(json_load(schema))
    with loom.preql.get_server(root, debug=True) as preql:
        rows_filename = os.path.join(rows_csv, os.listdir(rows_csv)[0])
        with open_compressed(rows_filename) as f:
            rows_df = pandas.read_csv(
                f,
                converters=preql.converters,
                index_col='_id')
        print 'rows_df ='
        print rows_df
        row_count = rows_df.shape[0]
        assert_equal(rows_df.shape[1], feature_count)
        rows_io = StringIO(rows_df.to_csv())
        result_string = preql.predict(rows_io, COUNT, id_offset=True)
        result_df = pandas.read_csv(StringIO(result_string), index_col=False)
        print 'result_df ='
        print result_df
        assert_equal(result_df.ndim, 2)
        assert_equal(result_df.shape[0], row_count * COUNT)
        assert_equal(result_df.shape[1], 1 + feature_count)
Example #31
def _make_encoder_builders_file((schema_in, rows_in)):
    assert os.path.isfile(rows_in)
    schema = json_load(schema_in)
    with open_compressed(rows_in, 'rb') as f:
        reader = csv.reader(f)
        header = reader.next()
        builders = []
        for name in header:
            if name in schema:
                model = schema[name]
                Builder = ENCODER_BUILDERS[model]
                builder = Builder(name, model)
            else:
                builder = None
            builders.append(builder)
        for row in reader:
            for value, builder in izip(row, builders):
                if builder is not None:
                    value = value.strip()
                    if value:
                        builder.add_value(value)
    return [b for b in builders if b is not None]
Example #32
def generate(
        feature_type='mixed',
        row_count=1000,
        feature_count=100,
        density=0.5,
        init_out='init.pb.gz',
        rows_out='rows.pbs.gz',
        model_out='model.pb.gz',
        groups_out='groups',
        debug=False,
        profile=None):
    '''
    Generate a synthetic dataset.
    '''
    root = os.path.abspath(os.path.curdir)
    init_out = os.path.abspath(init_out)
    rows_out = os.path.abspath(rows_out)
    model_out = os.path.abspath(model_out)
    groups_out = os.path.abspath(groups_out)

    model = generate_model(row_count, feature_count, feature_type, density)
    with open_compressed(init_out, 'w') as f:
        f.write(model.SerializeToString())

    with tempdir(cleanup_on_error=(not debug)):
        config = {'generate': {'row_count': row_count, 'density': density}}
        config_in = os.path.abspath('config.pb.gz')
        loom.config.config_dump(config, config_in)

        os.chdir(root)
        loom.runner.generate(
            config_in=config_in,
            model_in=init_out,
            rows_out=rows_out,
            model_out=model_out,
            groups_out=groups_out,
            debug=debug,
            profile=profile)
Example #33
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from protobuf stream to csv.
    '''
    for ext in ['.csv', '.gz', '.bz2']:
        assert not rows_csv_out.endswith(ext),\
            'rows_csv_out should be a dirname'
    assert chunk_size > 0
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    rows = loom.cFormat.row_stream_load(rows_in)
    try:
        empty = None
        for i in xrange(MAX_CHUNK_COUNT):
            file_out = os.path.join(
                rows_csv_out,
                'rows_{:06d}.csv.gz'.format(i))
            with open_compressed(file_out, 'wb') as f:
                writer = csv.writer(f)
                writer.writerow(header)
                empty = file_out
                for j in xrange(chunk_size):
                    data = rows.next().iter_data()
                    schema = izip(data['observed'], fields, decoders)
                    row = [
                        decode(data[field].next()) if observed else ''
                        for observed, field, decode in schema
                    ]
                    writer.writerow(row)
                    empty = None
    except StopIteration:
        if empty:
            os.remove(empty)
Example #34
    def relate(self, columns, result_out, sample_count=1000):
        """
        Compute pairwise related scores between each column in `columns`
        and every feature.

        Related scores are defined to be:
            Related(X, Y) = I(X; Y) / H(X, Y)
        Where:
            I(X; Y) is the mutual information between X and Y:
                I(X; Y) = E[ log( p(x, y) / ( p(x) p(y) ) ) ]; x, y ~ p(x, y)
            H(X) is the entropy of X:
                H(X) = E[ -log( p(x) ) ]; x ~ p(x)
        Expectations are estimated via Monte Carlo with `sample_count` samples.
        """
        with open_compressed(result_out, 'w') as f:
            writer = csv.writer(f)
            writer.writerow(self.feature_names)
            for target_column in set(columns):
                out_row = [target_column]
                to_sample1 = self.cols_to_sample([target_column])
                for to_relate in self.feature_names:
                    to_sample2 = self.cols_to_sample([to_relate])
                    mi = self.query_server.mutual_information(
                        to_sample1,
                        to_sample2,
                        sample_count=sample_count).mean
                    joined = [to_relate, target_column]
                    to_sample_both = self.cols_to_sample(joined)
                    joint_entropy = self.query_server.entropy(
                        to_sample_both,
                        sample_count=sample_count).mean
                    normalized_mi = self.normalize_mutual_information(
                        mi,
                        joint_entropy)
                    out_row.append(normalized_mi)
                writer.writerow(out_row)
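normalize_mutual_information is not shown in this listing; given the docstring's definition Related(X, Y) = I(X; Y) / H(X, Y), a minimal sketch would be:

def normalize_mutual_information(mi, joint_entropy):
    # Sketch only: divide the mutual-information estimate by the joint
    # entropy; the guard against a non-positive denominator is an assumption.
    if joint_entropy <= 0.0:
        return 0.0
    return mi / joint_entropy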
Example #35
def import_schema():
    schema = [line.strip() for line in open_compressed(VOCAB)]
    with open_compressed(SCHEMA_CSV, 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(schema)
    return schema
Example #36
def csv_writer(filename):
    with open_compressed(filename, 'wb') as f:
        yield csv.writer(f)
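csv_writer, like csv_reader, csv_input, and csv_output above, yields from inside a with block and is used elsewhere in these examples as a context manager (e.g. `with csv_reader(SAMPLES) as reader:`), so each is presumably decorated with contextlib.contextmanager; the decorator simply does not survive in this listing. A sketch under that assumption:

import csv
from contextlib import contextmanager

@contextmanager
def csv_writer(filename):
    # Same body as the example above, with the assumed decorator restored;
    # open_compressed is the gzip/bz2-aware opener sketched after Example #1.
    with open_compressed(filename, 'wb') as f:
        yield csv.writer(f)

# with csv_writer('points.csv.gz') as writer:
#     writer.writerow(['x', 'y'])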
Example #37
def pickle_dump(data, filename):
    with open_compressed(filename, 'wb') as f:
        pickle.dump(data, f)
Example #38
def pickle_load(filename):
    with open_compressed(filename, 'rb') as f:
        return pickle.load(f)
Example #39
def _load_checkpoint(step):
    message = loom.schema_pb2.Checkpoint()
    filename = checkpoint_files(step)['checkpoint']
    with open_compressed(filename) as f:
        message.ParseFromString(f.read())
    return message
Example #40
def csv_input(arg):
    if hasattr(arg, 'read'):
        yield csv.reader(arg)
    else:
        with open_compressed(arg, 'rb') as infile:
            yield csv.reader(infile)
Example #41
def get_example_requests(model, rows, query_type='mixed'):
    assert query_type in ['sample', 'score', 'mixed']
    cross_cat = CrossCat()
    with open_compressed(model, 'rb') as f:
        cross_cat.ParseFromString(f.read())
    feature_count = sum(len(kind.featureids) for kind in cross_cat.kinds)
    featureids = range(feature_count)

    nontrivials = [True] * feature_count
    for kind in cross_cat.kinds:
        fs = iter(kind.featureids)
        for model in loom.schema.MODELS.iterkeys():
            for shared in getattr(kind.product_model, model):
                f = fs.next()
                if model == 'dd':
                    if len(shared.alphas) == 0:
                        nontrivials[f] = False
                elif model == 'dpd':
                    if len(shared.betas) == 0:
                        nontrivials[f] = False
    all_observed = nontrivials[:]
    none_observed = [False] * feature_count

    observeds = []
    observeds.append(all_observed)
    for f, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            observed = all_observed[:]
            observed[f] = False
            observeds.append(observed)
    for f in featureids:
        observed = [
            nontrivial and sample_bernoulli(0.5)
            for nontrivial in nontrivials
        ]
        observeds.append(observed)
    for f, nontrivial in izip(featureids, nontrivials):
        if nontrivial:
            observed = none_observed[:]
            observed[f] = True
            observeds.append(observed)
    observeds.append(none_observed)

    requests = []
    for i, observed in enumerate(observeds):
        request = Query.Request()
        request.id = "example-{}".format(i)
        if query_type in ['sample', 'mixed']:
            set_diff(request.sample.data, none_observed)
            request.sample.to_sample.sparsity = DENSE
            request.sample.to_sample.dense[:] = observed
            request.sample.sample_count = 1
        if query_type in ['score', 'mixed']:
            set_diff(request.score.data, none_observed)
        requests.append(request)
    for row in load_rows(rows)[:20]:
        i += 1
        request = Query.Request()
        request.id = "example-{}".format(i)
        if query_type in ['sample', 'mixed']:
            request.sample.sample_count = 1
            request.sample.data.MergeFrom(row.diff)
            request.sample.to_sample.sparsity = DENSE
            conditions = izip(nontrivials, row.diff.pos.observed.dense)
            to_sample = [
                nontrivial and not is_observed
                for nontrivial, is_observed in conditions
            ]
            set_observed(request.sample.to_sample, to_sample)
        if query_type in ['score', 'mixed']:
            request.score.data.MergeFrom(row.diff)
        requests.append(request)
    return requests
Example #42
def csv_reader(filename):
    with open_compressed(filename, 'rb') as f:
        yield csv.reader(f)
Example #43
def csv_load(filename):
    with open_compressed(filename) as f:
        reader = csv.reader(f)
        return list(reader)
Example #44
def generate_one((name, sample_count, force, debug)):
    paths = loom.store.get_paths(name, sample_count=sample_count)
    if not force and all(os.path.exists(f) for f in paths.itervalues()):
        with open_compressed(paths['ingest']['version']) as f:
            version = f.read().strip()
        if version == loom.__version__:
            return
    print 'generating', name
    mkdir_p(paths['root'])
    with open_compressed(paths['ingest']['version'], 'w') as f:
        f.write(loom.__version__)
    config = CONFIGS[name]
    chunk_size = max(10, (config['row_count'] + 7) / 8)
    loom.transforms.make_fake_transforms(
        transforms_out=paths['ingest']['transforms'])
    loom.generate.generate(
        init_out=paths['samples'][0]['init'],
        rows_out=paths['ingest']['rows'],
        model_out=paths['samples'][0]['model'],
        groups_out=paths['samples'][0]['groups'],
        assign_out=paths['samples'][0]['assign'],
        **config)
    loom.format.make_schema(
        model_in=paths['samples'][0]['model'],
        schema_out=paths['ingest']['schema'])
    loom.format.make_fake_encoding(
        schema_in=paths['ingest']['schema'],
        model_in=paths['samples'][0]['model'],
        encoding_out=paths['ingest']['encoding'])
    loom.format.make_schema_row(
        schema_in=paths['ingest']['schema'],
        schema_row_out=paths['ingest']['schema_row'])
    loom.runner.tare(
        schema_row_in=paths['ingest']['schema_row'],
        rows_in=paths['ingest']['rows'],
        tares_out=paths['ingest']['tares'],
        debug=debug)
    loom.runner.sparsify(
        schema_row_in=paths['ingest']['schema_row'],
        tares_in=paths['ingest']['tares'],
        rows_in=paths['ingest']['rows'],
        rows_out=paths['ingest']['diffs'],
        debug=debug)
    loom.format.export_rows(
        encoding_in=paths['ingest']['encoding'],
        rows_in=paths['ingest']['rows'],
        rows_csv_out=paths['ingest']['rows_csv'],
        chunk_size=chunk_size)
    loom.format.import_rowids(
        rows_csv_in=paths['ingest']['rows_csv'],
        rowids_out=paths['ingest']['rowids'],
        id_field='_id')
    protobuf_stream_dump([], paths['query']['query_log'])
    loom.config.config_dump({}, paths['query']['config'])
    for seed, sample in enumerate(paths['samples']):
        loom.config.config_dump({'seed': seed}, sample['config'])
        loom.generate.generate_init(
            encoding_in=paths['ingest']['encoding'],
            model_out=sample['init'],
            seed=seed)
        loom.runner.shuffle(
            rows_in=paths['ingest']['diffs'],
            rows_out=sample['shuffled'],
            seed=seed,
            debug=debug)
        protobuf_stream_dump([], sample['infer_log'])
    sample0 = paths['samples'][0]
    for seed, sample in enumerate(paths['samples'][1:]):
        if LOOM_DEBUG_MIX:
            cp_ns(sample0['model'], sample['model'])
            cp_ns(sample0['groups'], sample['groups'])
            cp_ns(sample0['assign'], sample['assign'])
        else:
            loom.runner.mix(
                config_in=sample['config'],
                rows_in=paths['ingest']['rows'],
                model_in=sample0['model'],
                groups_in=sample0['groups'],
                assign_in=sample0['assign'],
                model_out=sample['model'],
                groups_out=sample['groups'],
                assign_out=sample['assign'],
                debug=debug)
    loom.consensus.make_fake_consensus(
        paths=paths,
        debug=debug)
Example #45
def csv_dump(data, filename):
    with open_compressed(filename, 'w') as f:
        writer = csv.writer(f)
        for row in data:
            writer.writerow(row)