def predict(self, rows_csv, count, result_out, id_offset=True):
    with open_compressed(rows_csv, 'rb') as fin:
        with open_compressed(result_out, 'w') as fout:
            reader = csv.reader(fin)
            writer = csv.writer(fout)
            feature_names = list(reader.next())
            writer.writerow(feature_names)
            name_to_pos = {name: i for i, name in enumerate(feature_names)}
            # Map each modeled feature to its column position and its codecs.
            pos_to_decode = {}
            schema = []
            for encoder in self.encoders:
                pos = name_to_pos.get(encoder['name'])
                encode = load_encoder(encoder)
                decode = load_decoder(encoder)
                if pos is not None:
                    pos_to_decode[pos] = decode
                schema.append((pos, encode))
            for row in reader:
                # Blank cells are sampled; non-blank cells are conditioned on.
                conditioning_row = []
                to_sample = []
                if id_offset:
                    row_id = row.pop(0)
                for pos, encode in schema:
                    value = None if pos is None else row[pos].strip()
                    observed = bool(value)
                    to_sample.append(not observed)
                    if observed:
                        conditioning_row.append(encode(value))
                    else:
                        conditioning_row.append(None)
                samples = self.query_server.sample(
                    to_sample,
                    conditioning_row,
                    count)
                for sample in samples:
                    if id_offset:
                        out_row = [row_id]
                    else:
                        out_row = []
                    for name in feature_names:
                        pos = name_to_pos[name]
                        decode = pos_to_decode[pos]
                        val = sample[pos]
                        # Decode the sampled value back to its CSV string form.
                        out_row.append(decode(val))
                    writer.writerow(out_row)
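# Usage sketch (hypothetical; the receiver name and file paths below are
# illustrative, not part of this module). For each input row, blank cells are
# treated as targets and sampled conditioned on the non-blank cells, and
# `count` completed rows are written per input row:
#
#   server.predict('queries.csv.gz', count=10, result_out='samples.csv.gz')
#
# With id_offset=True (the default), the first cell of every row is treated as
# an opaque row id and copied verbatim into each generated sample row.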
def __init__(self, query_server, encoding=None, debug=False):
    self._paths = loom.store.get_paths(query_server.root)
    if encoding is None:
        encoding = self._paths['ingest']['encoding']
    self._query_server = query_server
    self._encoders = json_load(encoding)
    transforms = self._paths['ingest']['transforms']
    self._transform = loom.transforms.load_transforms(transforms)
    self._feature_names = [e['name'] for e in self._encoders]
    self._feature_set = frozenset(self._feature_names)
    self._name_to_pos = {
        name: i
        for i, name in enumerate(self._feature_names)
    }
    self._name_to_decode = {
        e['name']: load_decoder(e)
        for e in self._encoders
    }
    self._name_to_encode = {
        e['name']: load_encoder(e)
        for e in self._encoders
    }
    self._rowid_map = None
    self._debug = debug
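# Construction sketch (hypothetical; `PreQL` below stands for whichever class
# owns this __init__, and the encoding path is illustrative). `query_server`
# is expected to expose `.root`; when `encoding` is omitted, the ingest
# encoding and transforms are read from the store paths under that root:
#
#   server = PreQL(query_server)
#   server = PreQL(query_server, encoding='path/to/encoding.json', debug=True)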
    'tags': ['', 'big_data machine_learning platform'],
}
for fluent_type, values in EXAMPLE_VALUES.items():
    EXAMPLE_VALUES['optional_{}'.format(fluent_type)] = [''] + values
EXAMPLE_VALUES['id'] = ['any unique string can serve as an id']

FLUENT_TO_BASIC = {
    'boolean': 'bb',
    'categorical': 'dd',
    'unbounded_categorical': 'dpd',
    'count': 'gp',
    'real': 'nich',
}

encode_bool = load_encoder({'model': 'bb'})
decode_bool = load_decoder({'model': 'bb'})


def get_row_dict(header, row):
    '''By convention, empty strings are omitted from the result dict.'''
    return {key: value for key, value in izip(header, row) if value}


class TransformSequence(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def forward_set(self, feature_set):
        result = set(feature_set)
        for t in self.transforms:
            t.forward_set(result)