Exemple #1
    def _process_(self, data: Data):
        """Return `data` updated with lazily indexed versions of ``self.fields``.

        Each field in ``self.fields`` is replaced by a zero-argument callable
        that, when invoked, evaluates ``data[field]`` and indexes it with the
        indices taken from ``self.partitionings(data)[self.i][self.istep]``.
        Those indices are computed at most once and kept in ``self._indices``.
        """
        def cached_indices():
            # Reuse the value memoized on the instance by a previous call.
            if self._indices is not None:
                return self._indices
            self._indices = self.partitionings(data)[self.i][self.istep]
            return self._indices

        def lazy_slice(name):
            # A factory gives every lambda its own `name` binding.
            return lambda: data[name][cached_indices()]

        return data.update(
            self, **{name: lazy_slice(name) for name in self.fields}
        )
Exemple #2
    def _process_(self, data):
        """Start building a new Data object without the field ``self.field``.

        The entry is removed from copies of ``data.field_funcs_m`` and
        ``data.uuids``, and the "changed" entry of both copies is reset.
        """
        # Drop the field from a copy of the field functions; "changed" becomes an empty list.
        fields = data.field_funcs_m.copy()
        del fields[self.field]
        fields["changed"] = []

        # Mirror the removal in a copy of the uuids; "changed" gets the UUID built from b"[]".
        uuids = data.uuids.copy()
        del uuids[self.field]
        uuids["changed"] = UUID(b"[]")

        # NOTE(review): the call below is cut off in this view (unclosed parenthesis);
        # the remaining arguments -- presumably `fields` -- are not visible here.
        return Data(data.uuid * self.uuid, uuids, data.history << self,
Exemple #3
    def _process_(self, data: Data):
        """Return `data` updated with a lazily computed replacement for ``X``.

        When ``nominal_idxs(data.Xt)`` yields at least one index, the selected
        columns of ``data.X`` are passed through ``OneHotEncoder`` and
        column-stacked in front of ``num``; otherwise ``data.X`` is returned
        unchanged. Nothing is computed until ``func`` is called.
        """
        # REMINDER: Binarize will do nothing to numeric datasets, but the uuid still needs to be predictable.
        # So, the provided Data object should be "processed" anyway.
        def func():
            data_nominal_idxs = nominal_idxs(data.Xt)
            encoder = OneHotEncoder()
            if len(data_nominal_idxs) > 0:
                # toarray() is applied to the encoder output (presumably sparse -> dense; confirm).
                nom = encoder.fit_transform(
                    data.X[:, data_nominal_idxs]).toarray()
                # NOTE(review): the np.delete call below is cut off in this view (unclosed
                # parenthesis); presumably it removes the nominal columns -- confirm the
                # missing arguments against the original file.
                num = np.delete(data.X, data_nominal_idxs,
                return np.column_stack((nom, num))
            return data.X

        return data.update(self, X=func)
Exemple #4
    def fetch(self, data, lock=False, lazy=True, ignorelock=False):
        # , recursive=True):   # TODO: consider whether include_empty=False makes sense
        """Fetch the data object fields on-demand.

        NOTE(review): several lines of this method appear to be missing in this
        view (see the inline notes); only the visible behavior is described.

        :param data: uuid string or a (probably still not fully evaluated) Data object.
        :param lock: when true and no stored entry is found (``ret`` is None or has
            no uuids), ``self.lock(data_id, check_existence=False)`` is attempted.
        :param lazy: passed on to ``self.fetchstream``; also selects whether the
            fetched fields get a storage-specific ``__name__``.
        :param ignorelock: when true, a LockedEntryException from ``self.getdata``
            is tolerated: ``ret`` becomes None and ``lock`` is reset to False.
        :return: ``Data(UUID(data_id), uuids, history, **fields)``.
        :raises Exception: when locking was requested and ``self.lock`` fails.
        """
        data_id = data if isinstance(data, str) else data.id
        # lst = []
        # print("LOGGING:::  Fetching...", data_id)
        # while True:
        # NOTE(review): a `try:` line seems to be missing right here; the indented
        # statement and the `except` below have no visible opening `try`.
            ret = self.getdata(data_id, include_empty=True)
        except LockedEntryException:
            if not ignorelock:
                # NOTE(review): `raise None` itself fails with TypeError at runtime
                # (exceptions must derive from BaseException); confirm what was intended.
                raise None
            ret = None
            lock = False  # Already locked.

        if ret is None or not ret["uuids"]:
            # REMINDER: check_existence is False because it may be an empty data
            # [and so that the Cache works even if it has been interrupted]
            if lock and not self.lock(data_id, check_existence=False):
                raise Exception("Could not lock data:", data_id)
            # NOTE(review): nothing visible returns from this branch, so execution
            # continues to `ret["uuids"]` below even when `ret` is None -- confirm.

        # Field name -> content id, extended with the "stream"/"inner" entries when present.
        dic = ret["uuids"].copy()
        if ret["stream"]:
            dic["stream"] = None
        if ret["inner"]:
            dic["inner"] = ret["inner"]

        # When a Data object was given, its own field mapping is reused (and mutated below).
        fields = {} if isinstance(data, str) else data.field_funcs_m
        for field, fid in list(dic.items()):
            if field == "inner":
                # NOTE(review): this lambda closes over the loop variable `fid` (late
                # binding): it uses whatever `fid` holds at the time it is called.
                fields[field] = lambda: self.fetch(fid)
            elif field == "stream":
                fields[field] = lambda: self.fetchstream(data_id, lazy)
            elif field == "changed":
                # Read "changed" from storage only when a uuid string was given.
                fields[field] = unpack(self.getcontent(fid)) if isinstance(
                    data, str) else data.changed
            elif field not in ["inner"] and (isinstance(data, str)
                                             or field in data.changed):
                if lazy:
                    # NOTE(review): the lambda on the next line is cut off in this view,
                    # and an `else:` presumably preceded the eager assignment after it.
                    fields[field] = (lambda fid_: lambda: unpack(
                    fields[field] = unpack(self.getcontent(fid))

            # Call each lambda by a friendly name.
            if lazy and field != "changed" and islazy(
                    fields[field]):  # and field in data.field_funcs_m:
                fields[field].__name__ = "_" + fields[
                    field].__name__ + "_from_storage_" + self.id

        if isinstance(data, str):
            # if lazy:
            #     print("Check whether lazy still does not return history for data given by uuid-string")  # <-- TODO?
            history = self.fetchhistory(data)
            # NOTE(review): an `else:` seems to be missing before the next line; as
            # shown, it runs for uuid strings too and overwrites the fetched history.
            history = data.history
        # print("LOGGING:::  > > > > > > > > > fetched?", data_id, ret)
        return Data(UUID(data_id),
                    {k: UUID(v)
                     for k, v in ret["uuids"].items()}, history, **fields)
Exemple #5
    def store(self, data: Data, unlock=False, ignoredup=False, lazy=False):
        """Store all Data object fields as soon as one of them is evaluated.

        The sequence of queries is planned to minimize traffic and CPU load,
        otherwise it would suffice to just send 'insert or ignore' of dumps.

        NOTE(review): many lines of this method appear to be missing in this view
        (see the inline notes); only the visible behavior is described here.

        :param data: Data object to store.
        :param unlock: when true, an already stored ``data.id`` does not raise
            DuplicateEntryException.
        :param ignoredup: Whether to send the query anyway, ignoring errors due to
            already existent registries.
        :param lazy: when true, every entry of ``data.field_funcs_m`` is replaced by
            a wrapper built by the nested ``func``; a "stream" field raises.
        :return: ``lst[0]``. The original text describes the result as: list of
            inserted (or hoped to be inserted for threaded storages) Data ids
            (only meaningful for Data objects with inner).
        :raises DuplicateEntryException: when the data is already stored and
            neither ``ignoredup`` nor ``unlock`` is set.
        :raises Exception: for a stream in lazy mode, or for an empty stream.
        """
        if not ignoredup and self.hasdata(data.id) and not unlock:
            raise DuplicateEntryException(data.id)

        # Embed lazy storers inside the Data object.
        # NOTE(review): nothing visible appends to `lst`, yet it is iterated and
        # indexed below -- presumably an append inside the loop was lost.
        lst = []

        def func(held_data, name, field_funcs, puts):
            # Builds the wrapper that replaces field `name` in lazy mode.
            def lamb():
                for k, v in field_funcs.items():
                    if islazy(v):
                        v = v()  # Cannot call data[k] due to infinite loop.
                        # NOTE(review): the next line is cut off in this view; presumably
                        # it writes `v` back into a mapping indexed by `k` -- confirm.
                        k] = v  # The old value may not be lazy, but the new one can be due to this very lazystore.
                    id = held_data.uuids[k].id
                    if id in puts:
                        if k != "inner":
                            # TODO/REMINDER: exceptionally two datasets can have some equal contents, like Xd;
                            #   so we send it again while the hash is not based on content
                            # NOTE(review): the call that receives the argument below is not visible.
                                            fpack(held_data, k),
                # One (data id, field name, field uuid id) row per field except "inner".
                # NOTE(review): no visible statement uses `rows` afterwards.
                rows = [(held_data.id, fname, fuuid.id)
                        for fname, fuuid in held_data.uuids.items()
                        if fname != "inner"]
                return held_data.field_funcs_m[name]

            return lamb

        streams = {}
        while True:
            # Fields.
            cids = [u.id for u in data.uuids.values()]
            # NOTE(review): self.hascontent(cids) is re-evaluated for every cid here.
            missing = [cid for cid in cids if cid not in self.hascontent(cids)]
            if lazy:
                # All fields will be evaluated at the time one of them is called.
                field_funcs_copy = data.field_funcs_m.copy()
                for field in data.field_funcs_m:
                    if field == "stream":
                        # data.field_funcs_m["stream"] = map(
                        #     lambda d: self.store(d, unlock=True, lazy=False),
                        #     data.field_funcs_m["stream"]
                        # )
                        raise Exception(
                            "A lazy storage cannot handle streams for now.")
                    data.field_funcs_m[field] = func(data, field,
                                                     field_funcs_copy, missing)
                    data.field_funcs_m[field].__name__ = "_" + data.uuids[
                        field].id + "_to_storage_" + self.id
                # NOTE(review): an `else:` presumably preceded this loop (eager path), and
                # another one the nested `if k != "inner"` below -- confirm.
                for k, v in data.items():
                    if k == "stream" and data.hasstream:
                        # Consume stream, to be stored after putdata().
                        streams[data.id] = list(data.stream)
                        if k != "inner" and k != "stream":
                            id = data.uuids[k].id
                            if id in missing:
                                content = fpack(data, k)
                                # TODO/REMINDER: exceptionally two datasets can have some equal contents, like Xd;
                                #   so we send it again while the hash is not based on content
                                self.putcontent(id, content, ignoredup=True)

            # NOTE(review): the body of this `if` is missing in this view (presumably
            # the loop exit); as shown, `while True` has no visible way to terminate.
            if not data.hasinner:
            data = data.inner

        for i, d in reversed(list(enumerate(lst))):
            # NOTE(review): the body of this `if` is missing in this view.
            if i == 0 and unlock:

            # History.
            ancestor_duuid = Root.uuid
            for step in list(d.history):
                # print("LOGGING:::  ssssssssSSSSSSSSSSSS", step.id)
                # NOTE(review): the body of this `if` is missing in this view.
                if not self.hasstep(step.id):

                parent_uuid = ancestor_duuid
                ancestor_duuid = ancestor_duuid * step.uuid
                # Here, locked=NULL means 'placeholder', which can be updated in the future if the same data happens to be truly stored.
                # We assume it is faster to do a single insertignore than select+insert, hence ignoredup=True here.
                # NOTE(review): an `else:` seems to be missing between the two assignments
                # below, and the call owning the `inner and inner.id,` argument is not visible.
                if ancestor_duuid == d.uuid:
                    hasstream, inner = d.hasstream, d.inner if d.hasinner else None
                    hasstream, inner = False, None
                             inner and inner.id,
                # TODO: adopt logging    print(datauuid, 3333333333333333333333333333333333333333)

            if lazy:
                # TODO: adopt logging    print(d.id, 7777777777777777777777777777777)
                if d.id in streams:
                    rows = []
                    # Store each streamed Data object and record its position in the stream.
                    for pos, streamed_data in enumerate(streams[d.id]):
                        self.store(streamed_data, ignoredup=True)
                        rows.append((d.id, str(pos), streamed_data.id))
                    if not rows:
                        raise Exception("Empty stream??")
                    self.putstream(rows, ignoredup=ignoredup)

                    # Return a new iterator in the place of the original stream.
                    d.field_funcs_m["stream"] = iter(streams[d.id])
                # NOTE(review): the putfields call below is cut off in this view.
                self.putfields([(d.id, fname, fuuid.id)
                                for fname, fuuid in d.uuids.items()],
        return lst[0]
Exemple #6
 def _process_(self, data: Data):
     """Return `data` updated with a lazily transformed ``X``.

     Nothing is computed here: ``self.model(data.inner)`` is obtained and its
     ``transform`` applied to ``data.X`` only when the new ``X`` is called.
     """
     return data.update(
         self, X=lambda: self.model(data.inner).transform(data.X)
     )
Exemple #7
 def _process_(self, data: Data):
     """Return `data` updated with a lazily computed ``R`` array.

     When called, ``R`` applies every callable in ``self.selected`` to
     ``(data, self.target, self.prediction)`` and packs the results, in order,
     into a NumPy array.
     """
     def evaluate(measure):
         # Attributes are read at call time, as each result is computed.
         return measure(data, self.target, self.prediction)

     return data.update(
         self, R=lambda: np.array([evaluate(m) for m in self.selected])
     )
Exemple #8
 def _process_(self, data: Data):
     """Return `data` updated with lazy predictions from the inner-data model.

     ``Z`` is always provided and calls ``predict`` on ``data.X``. ``P`` is
     added, calling ``predict_proba``, only when ``self.config`` holds a truthy
     "probability" entry. Both are evaluated only when called.
     """
     lazy_outputs = {"Z": lambda: self.model(data.inner).predict(data.X)}

     # Guard clause: without a truthy "probability" setting only Z is produced.
     if not ("probability" in self.config and self.config["probability"]):
         return data.update(self, **lazy_outputs)

     return data.update(
         self,
         **lazy_outputs,
         P=lambda: self.model(data.inner).predict_proba(data.X),
     )