def _process_(self, data: Data):
    def indices():
        if self._indices is None:
            self._indices = self.partitionings(data)[self.i][self.istep]
        return self._indices

    newmatrices = {}
    for f in self.fields:
        # The extra lambda binds the current f; a bare closure would capture the loop variable late.
        newmatrices[f] = (lambda f: lambda: data[f][indices()])(f)
    return data.update(self, **newmatrices)
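# A minimal standalone sketch (hypothetical names, not part of this module) of why the
# `(lambda f: lambda: ...)(f)` idiom above is needed: Python closures capture variables,
# not values, so a bare `lambda: data[f]` written inside the loop would see the last f.
def _closure_capture_sketch():
    values = {"X": 1, "Y": 2}
    late, bound = {}, {}
    for k in values:
        late[k] = lambda: values[k]                     # captures the variable k (late binding)
        bound[k] = (lambda kk: lambda: values[kk])(k)   # freezes the current value of k
    assert late["X"]() == 2    # both late lambdas end up reading the final k, i.e. "Y"
    assert bound["X"]() == 1   # value was bound at definition time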
def _process_(self, data):
    fields = data.field_funcs_m.copy()
    del fields[self.field]
    fields["changed"] = []
    uuids = data.uuids.copy()
    del uuids[self.field]
    uuids["changed"] = UUID(b"[]")
    return Data(data.uuid * self.uuid, uuids, data.history << self, **fields)
def _process_(self, data: Data):
    # REMINDER: Binarize will do nothing to numeric datasets, but the uuid still needs to be predictable.
    # So, the provided Data object should be "processed" anyway.
    def func():
        data_nominal_idxs = nominal_idxs(data.Xt)
        encoder = OneHotEncoder()
        if len(data_nominal_idxs) > 0:
            nom = encoder.fit_transform(data.X[:, data_nominal_idxs]).toarray()
            num = np.delete(data.X, data_nominal_idxs, axis=1).astype(float)
            return np.column_stack((nom, num))
        return data.X

    return data.update(self, X=func)
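# A minimal standalone usage sketch of the binarization performed by func() above
# (the helper name and sample data are illustrative, not part of this module):
# nominal columns are one-hot encoded and stacked next to the remaining numeric columns.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def _binarize_sketch(X, nominal_columns):
    if not nominal_columns:
        return X
    nom = OneHotEncoder().fit_transform(X[:, nominal_columns]).toarray()
    num = np.delete(X, nominal_columns, axis=1).astype(float)
    return np.column_stack((nom, num))

# _binarize_sketch(np.array([["a", 1.0], ["b", 2.0]], dtype=object), [0])
# -> array([[1., 0., 1.],
#           [0., 1., 2.]])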
def fetch(self, data, lock=False, lazy=True, ignorelock=False):  # , recursive=True):
    # TODO: think about whether include_empty=False makes sense
    """Fetch the data object fields on-demand.

    data: uuid string or a (probably still not fully evaluated) Data object."""
    data_id = data if isinstance(data, str) else data.id
    try:
        ret = self.getdata(data_id, include_empty=True)
    except LockedEntryException:
        if not ignorelock:
            raise
        ret = None
        lock = False  # Already locked.

    if ret is None or not ret["uuids"]:
        # REMINDER: check_existence=False because the data may be empty
        # [and the Cache must keep working even if it was interrupted].
        if lock and not self.lock(data_id, check_existence=False):
            raise Exception("Could not lock data:", data_id)
        return

    dic = ret["uuids"].copy()
    if ret["stream"]:
        dic["stream"] = None
    if ret["inner"]:
        dic["inner"] = ret["inner"]

    fields = {} if isinstance(data, str) else data.field_funcs_m
    for field, fid in list(dic.items()):
        if field == "inner":
            fields[field] = lambda: self.fetch(fid)
        elif field == "stream":
            fields[field] = lambda: self.fetchstream(data_id, lazy)
        elif field == "changed":
            fields[field] = unpack(self.getcontent(fid)) if isinstance(data, str) else data.changed
        elif field not in ["inner"] and (isinstance(data, str) or field in data.changed):
            if lazy:
                fields[field] = (lambda fid_: lambda: unpack(self.getcontent(fid_)))(fid)
            else:
                fields[field] = unpack(self.getcontent(fid))

        # Call each lambda by a friendly name.
        if lazy and field != "changed" and islazy(fields[field]):  # and field in data.field_funcs_m:
            fields[field].__name__ = "_" + fields[field].__name__ + "_from_storage_" + self.id

    if isinstance(data, str):
        # TODO? check whether lazy still does not return the history for data given by a uuid string.
        history = self.fetchhistory(data)
    else:
        history = data.history

    return Data(UUID(data_id), {k: UUID(v) for k, v in ret["uuids"].items()}, history, **fields)
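# Hypothetical usage sketch for fetch() (the storage instance and uuid below are placeholders,
# not taken from this codebase). With lazy=True the returned Data carries callables instead of
# matrices, so nothing is read from the backend until a field is accessed:
#
#     data = storage.fetch("<data-uuid>", lazy=True)
#     X = data.X          # only now is the packed content retrieved and unpacked
#
# lock=True marks a placeholder entry when the data is absent, so concurrent workers can skip
# the same computation; fetching an already locked entry raises LockedEntryException unless
# ignorelock=True is passed.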
def store(self, data: Data, unlock=False, ignoredup=False, lazy=False):
    """Store all Data object fields as soon as one of them is evaluated.

    The sequence of queries is planned to minimize traffic and CPU load;
    otherwise it would suffice to just send 'insert or ignore' of dumps.

    Parameters
    ----------
    data
        Data object to store.
    unlock
        Whether to unlock the entry after storing it.
    ignoredup
        Whether to send the query anyway, ignoring errors due to already existent registries.
    lazy
        Whether to embed lazy storers inside the Data object instead of storing its fields now.

    Returns
    -------
    The outermost stored Data object (for threaded storages the writes are only
    hoped to have been performed).

    Raises
    ------
    DuplicateEntryException
    """
    if not ignoredup and self.hasdata(data.id) and not unlock:
        raise DuplicateEntryException(data.id)

    # Embed lazy storers inside the Data object.
    lst = []

    def func(held_data, name, field_funcs, puts):
        def lamb():
            for k, v in field_funcs.items():
                if islazy(v):
                    v = v()  # Cannot call data[k] due to infinite loop.
                # The old value may not be lazy, but the new one can be, due to this very lazystore.
                held_data.field_funcs_m[k] = v
                id = held_data.uuids[k].id
                if id in puts:
                    if k != "inner":
                        # TODO/REMINDER: exceptionally two datasets can have some equal contents, like Xd;
                        # so we send it again while the hash is not based on content.
                        self.putcontent(id, fpack(held_data, k), ignoredup=True)
            rows = [(held_data.id, fname, fuuid.id) for fname, fuuid in held_data.uuids.items() if fname != "inner"]
            self.putfields(rows)
            return held_data.field_funcs_m[name]

        return lamb

    streams = {}
    while True:
        # Fields.
        cids = [u.id for u in data.uuids.values()]
        missing = [cid for cid in cids if cid not in self.hascontent(cids)]
        if lazy:
            # All fields will be evaluated at the time one of them is called.
            field_funcs_copy = data.field_funcs_m.copy()
            for field in data.field_funcs_m:
                if field == "stream":
                    # data.field_funcs_m["stream"] = map(
                    #     lambda d: self.store(d, unlock=True, lazy=False),
                    #     data.field_funcs_m["stream"]
                    # )
                    raise Exception("A lazy storage cannot handle streams for now.")
                data.field_funcs_m[field] = func(data, field, field_funcs_copy, missing)
                data.field_funcs_m[field].__name__ = "_" + data.uuids[field].id + "_to_storage_" + self.id
        else:
            for k, v in data.items():
                if k == "stream" and data.hasstream:
                    # Consume the stream, to be stored after putdata().
                    streams[data.id] = list(data.stream)
                elif k != "inner" and k != "stream":
                    id = data.uuids[k].id
                    if id in missing:
                        content = fpack(data, k)
                        # TODO/REMINDER: exceptionally two datasets can have some equal contents, like Xd;
                        # so we send it again while the hash is not based on content.
                        self.putcontent(id, content, ignoredup=True)

        lst.append(data)
        if not data.hasinner:
            break
        data = data.inner

    for i, d in reversed(list(enumerate(lst))):
        if i == 0 and unlock:
            self.unlock(d.id)

        # History.
        ancestor_duuid = Root.uuid
        for step in list(d.history):
            if not self.hasstep(step.id):
                self.storestep(step)
            parent_uuid = ancestor_duuid
            ancestor_duuid = ancestor_duuid * step.uuid
            if ancestor_duuid == d.uuid:
                hasstream, inner = d.hasstream, d.inner if d.hasinner else None
            else:
                hasstream, inner = False, None
            # Here, locked=NULL means 'placeholder', which can be updated in the future
            # if the same data happens to be truly stored.
            # We assume it is faster to do a single insertignore than select+insert,
            # hence ignoredup=True here.
            self.putdata(ancestor_duuid.id, step.id, inner and inner.id, hasstream, parent_uuid.id, None, ignoredup=True)

        if lazy:
            # Nothing else to do here: the lazy storers embedded above will put contents and field rows.
            pass
        else:
            if d.id in streams:
                rows = []
                for pos, streamed_data in enumerate(streams[d.id]):
                    self.store(streamed_data, ignoredup=True)
                    rows.append((d.id, str(pos), streamed_data.id))
                if not rows:
                    raise Exception("Empty stream??")
                self.putstream(rows, ignoredup=ignoredup)
                # Put a new iterator in the place of the original (already consumed) stream.
                d.field_funcs_m["stream"] = iter(streams[d.id])
            self.putfields([(d.id, fname, fuuid.id) for fname, fuuid in d.uuids.items()], ignoredup=True)

    return lst[0]
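# A minimal standalone sketch (independent of the Storage API; names are illustrative) of the
# lazy-store pattern implemented by func()/lamb() above: every field is wrapped so that the
# first access evaluates all fields and persists them in a single round of writes.
def _lazy_store_sketch(fields, persist):
    snapshot = dict(fields)  # name -> value-or-callable

    def wrap(name):
        def materialize():
            for k, v in snapshot.items():
                snapshot[k] = v() if callable(v) else v
            persist(snapshot)  # analogous to the putcontent/putfields calls in lamb()
            return snapshot[name]
        return materialize

    return {name: wrap(name) for name in fields}

# wrapped = _lazy_store_sketch({"X": lambda: [[1, 2]], "y": [0]}, print)
# wrapped["X"]()   # prints the fully evaluated dict, then returns [[1, 2]]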
def _process_(self, data: Data):
    newX = lambda: self.model(data.inner).transform(data.X)
    return data.update(self, X=newX)
def _process_(self, data: Data):
    newr = lambda: np.array([f(data, self.target, self.prediction) for f in self.selected])
    return data.update(self, R=newr)
def _process_(self, data: Data):
    matrices = {"Z": lambda: self.model(data.inner).predict(data.X)}
    if "probability" in self.config and self.config["probability"]:
        matrices["P"] = lambda: self.model(data.inner).predict_proba(data.X)
    return data.update(self, **matrices)
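# A minimal scikit-learn-only sketch (model choice and names are illustrative) of the conditional
# prediction above: Z is always provided, P only when probability output is requested, and both
# stay lazy until called.
from sklearn.linear_model import LogisticRegression

def _predict_sketch(Xtr, ytr, Xts, probability=False):
    model = LogisticRegression().fit(Xtr, ytr)
    matrices = {"Z": lambda: model.predict(Xts)}
    if probability:
        matrices["P"] = lambda: model.predict_proba(Xts)
    return matrices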