Esempio n. 1
0
def _check_predictions(rows_in, result_out, encoding):
    '''Validate a predictions CSV against its source rows.

    For each input row there must be COUNT predicted rows in which
    observed cells round-trip through their encoder, unobserved cells
    were filled in with a non-empty value, and the '_id' column is
    copied verbatim.
    '''
    encoder_specs = json_load(encoding)
    encoders_by_name = {
        spec['name']: load_encoder(spec)
        for spec in encoder_specs
    }
    with open_compressed(rows_in, 'rb') as raw_file:
        with open(result_out, 'r') as pred_file:
            raw_rows = csv.reader(raw_file)
            pred_rows = csv.reader(pred_file)
            header = next(raw_rows)
            next(pred_rows)  # discard the predictions header line
            for raw_row in raw_rows:
                # Every input row yields exactly COUNT sampled rows.
                for _ in range(COUNT):
                    pred_row = next(pred_rows)
                    triples = zip(header, raw_row, pred_row)
                    for name, in_val, out_val in triples:
                        if name == '_id':
                            assert_equal(in_val, out_val)
                            continue
                        encode = encoders_by_name[name]
                        if in_val.strip():
                            # Observed cell: prediction must agree.
                            assert_almost_equal(
                                encode(in_val),
                                encode(out_val))
                        else:
                            # Unobserved cell: a value must be sampled.
                            assert_true(bool(out_val.strip()))
Esempio n. 2
0
def test_predict(root, rows_csv, encoding, **unused):
    '''End-to-end check of PreQL.predict CSV output.

    Runs predict with id_offset=False, then verifies that each input
    row produced COUNT sampled rows: observed cells must agree after
    encoding, unobserved cells must have been filled in.
    '''
    COUNT = 10
    with tempdir(cleanup_on_error=CLEANUP_ON_ERROR):
        with loom.query.get_server(root, debug=True) as query_server:
            result_out = 'predictions_out.csv'
            first_file = os.listdir(rows_csv)[0]
            rows_in = os.path.join(rows_csv, first_file)
            encoder_specs = json_load(encoding)
            encoders_by_name = {
                spec['name']: load_encoder(spec)
                for spec in encoder_specs
            }
            preql = loom.preql.PreQL(query_server, encoding)
            preql.predict(rows_in, COUNT, result_out, id_offset=False)
            with open_compressed(rows_in, 'rb') as raw_file:
                with open(result_out, 'r') as pred_file:
                    raw_rows = csv.reader(raw_file)
                    pred_rows = csv.reader(pred_file)
                    header = next(raw_rows)
                    next(pred_rows)  # skip the predictions header
                    for raw_row in raw_rows:
                        for _ in range(COUNT):
                            pred_row = next(pred_rows)
                            triples = zip(header, raw_row, pred_row)
                            for name, in_val, out_val in triples:
                                encode = encoders_by_name[name]
                                if in_val.strip():
                                    assert_almost_equal(
                                        encode(in_val),
                                        encode(out_val))
                                else:
                                    assert_true(bool(out_val.strip()))
Esempio n. 3
0
 def predict(self, rows_csv, count, result_out, id_offset=True):
     '''Sample `count` completions of every row in a CSV and write them out.

     rows_csv   - path to a (possibly compressed) input CSV whose first
                  line is a header of feature names.
     count      - number of sampled rows to emit per input row.
     result_out - path of the (possibly compressed) output CSV.
     id_offset  - when True, treat the first column of each data row as a
                  row id: it is stripped before encoding and prepended to
                  every sampled output row.
     '''
     with open_compressed(rows_csv, 'rb') as fin:
         with open_compressed(result_out, 'w') as fout:
             reader = csv.reader(fin)
             writer = csv.writer(fout)
             # Header line of the input is echoed as the output header.
             feature_names = list(reader.next())
             writer.writerow(feature_names)
             name_to_pos = {name: i for i, name in enumerate(feature_names)}
             pos_to_decode = {}
             schema = []
             for encoder in self.encoders:
                 # Features missing from the CSV header get pos=None and
                 # are treated as unobserved in every row below.
                 pos = name_to_pos.get(encoder['name'])
                 encode = load_encoder(encoder)
                 decode = load_decoder(encoder)
                 if pos is not None:
                     pos_to_decode[pos] = decode
                 schema.append((pos, encode))
             for row in reader:
                 conditioning_row = []
                 to_sample = []
                 if id_offset:
                     # Strip the leading id column before encoding cells.
                     row_id = row.pop(0)
                 for pos, encode, in schema:
                     value = None if pos is None else row[pos].strip()
                     observed = bool(value)
                     # Request a sample for exactly the missing cells.
                     to_sample.append((not observed))
                     if observed is False:
                         conditioning_row.append(None)
                     else:
                         conditioning_row.append(encode(value))
                 samples = self.query_server.sample(
                     to_sample,
                     conditioning_row,
                     count)
                 for sample in samples:
                     if id_offset:
                         out_row = [row_id]
                     else:
                         out_row = []
                     for name in feature_names:
                         pos = name_to_pos[name]
                         # NOTE(review): `decode` is looked up but never
                         # applied; the raw sample cell is written as-is.
                         # Confirm whether samples arrive already decoded.
                         decode = pos_to_decode[pos]
                         val = sample[pos]
                         out_row.append(val)
                     writer.writerow(out_row)
Esempio n. 4
0
 def __init__(self, query_server, encoding=None, debug=False):
     '''Attach to a running query server and load codec metadata.

     When `encoding` is None, fall back to the ingest encoding path
     recorded in the store under the server's root.
     '''
     self._paths = loom.store.get_paths(query_server.root)
     if encoding is None:
         encoding = self._paths['ingest']['encoding']
     self._query_server = query_server
     self._encoders = json_load(encoding)
     self._transform = loom.transforms.load_transforms(
         self._paths['ingest']['transforms'])
     names = [spec['name'] for spec in self._encoders]
     self._feature_names = names
     self._feature_set = frozenset(names)
     self._name_to_pos = dict(
         (name, pos) for pos, name in enumerate(names))
     self._name_to_decode = dict(
         (spec['name'], load_decoder(spec)) for spec in self._encoders)
     self._name_to_encode = dict(
         (spec['name'], load_encoder(spec)) for spec in self._encoders)
     self._rowid_map = None  # presumably populated on demand elsewhere
     self._debug = debug
Esempio n. 5
0
 def __init__(self, query_server, encoding=None, debug=False):
     '''Wrap `query_server`, loading encoders and transforms from its store.

     `encoding` defaults to the ingest encoding stored under the
     server's root.
     '''
     paths = loom.store.get_paths(query_server.root)
     self._paths = paths
     if encoding is None:
         encoding = paths['ingest']['encoding']
     self._query_server = query_server
     encoders = json_load(encoding)
     self._encoders = encoders
     self._transform = loom.transforms.load_transforms(
         paths['ingest']['transforms'])
     self._feature_names = [e['name'] for e in encoders]
     self._feature_set = frozenset(self._feature_names)
     self._name_to_pos = {}
     for pos, name in enumerate(self._feature_names):
         self._name_to_pos[name] = pos
     self._name_to_decode = {e['name']: load_decoder(e) for e in encoders}
     self._name_to_encode = {e['name']: load_encoder(e) for e in encoders}
     self._rowid_map = None  # starts unset; filled in outside this view
     self._debug = debug
Esempio n. 6
0
    'text': ['This is a text feature.', 'Hello World!'],
    'tags': ['', 'big_data machine_learning platform'],
}
# Derive an 'optional_*' variant of every example feature whose value may
# also be missing (empty string).  Snapshot items() with list() so the
# dict is not mutated while being iterated -- on Python 2 .items() already
# returns a list, but on Python 3 this would raise RuntimeError.
for fluent_type, values in list(EXAMPLE_VALUES.items()):
    EXAMPLE_VALUES['optional_{}'.format(fluent_type)] = [''] + values
EXAMPLE_VALUES['id'] = ['any unique string can serve as an id']

# Map from user-facing "fluent" feature-type names to the short basic
# model codes used internally (e.g. 'bb' for boolean; the exact model
# semantics of each code are defined elsewhere -- TODO confirm against
# the project's model documentation).
FLUENT_TO_BASIC = {
    'boolean': 'bb',
    'categorical': 'dd',
    'unbounded_categorical': 'dpd',
    'count': 'gp',
    'real': 'nich',
}

# Shared module-level codec for boolean values, built from the 'bb'
# model spec via the project's encoder/decoder factories.
encode_bool = load_encoder({'model': 'bb'})
decode_bool = load_decoder({'model': 'bb'})


def get_row_dict(header, row):
    '''Return a {column_name: cell_value} dict for one CSV row.

    By convention, empty strings are omitted from the result dict.
    Uses the builtin zip instead of itertools.izip: the result is
    identical (rows are small and finite) and the helper stays
    portable across Python 2 and 3.
    '''
    return {key: value for key, value in zip(header, row) if value}


class TransformSequence(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def forward_set(self, feature_set):
        result = set(feature_set)
        for t in self.transforms:
Esempio n. 7
0
    'text': ['This is a text feature.', 'Hello World!'],
    'tags': ['', 'big_data machine_learning platform'],
}
# Add an 'optional_*' variant of each example feature that may also be
# missing (empty string).  Iterate over a list() snapshot of items() so
# the dict is not mutated during iteration -- harmless on Python 2
# (.items() is a list) but a RuntimeError on Python 3.
for fluent_type, values in list(EXAMPLE_VALUES.items()):
    EXAMPLE_VALUES['optional_{}'.format(fluent_type)] = [''] + values
EXAMPLE_VALUES['id'] = ['any unique string can serve as an id']

# Translation table from "fluent" (user-facing) feature-type names to
# the internal basic model codes.  The codes' model semantics are
# defined elsewhere in the project -- TODO confirm against model docs.
FLUENT_TO_BASIC = {
    'boolean': 'bb',
    'categorical': 'dd',
    'unbounded_categorical': 'dpd',
    'count': 'gp',
    'real': 'nich',
}

# Module-level boolean codec pair derived from the 'bb' model spec.
encode_bool = load_encoder({'model': 'bb'})
decode_bool = load_decoder({'model': 'bb'})


def get_row_dict(header, row):
    '''Return a {column_name: cell_value} dict for one CSV row.

    By convention, empty strings are omitted from the result dict.
    The builtin zip replaces itertools.izip: results are identical for
    these small finite rows, and zip works on Python 2 and 3 alike.
    '''
    return {key: value for key, value in zip(header, row) if value}


class TransformSequence(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def forward_set(self, feature_set):
        result = set(feature_set)
        for t in self.transforms: