Example #1
    def _process_(self, data: Data):
        def indices():
            # Compute the partition indices only once; reuse them afterwards.
            if self._indices is None:
                self._indices = self.partitionings(data)[self.i][self.istep]
            return self._indices

        # The immediately applied outer lambda freezes the current value of f;
        # a bare closure would capture the loop variable by reference and
        # every lazy matrix would end up reading the last field.
        newmatrices = {}
        for f in self.fields:
            newmatrices[f] = (lambda f: lambda: data[f][indices()])(f)
        return data.update(self, **newmatrices)
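
The (lambda f: lambda: ...)(f) construction is needed because Python closures
capture variables by reference, not by value. A minimal, self-contained sketch
of the pitfall and the fix (toy names, not part of the library):

def make_getters_broken(keys):
    # Late binding: every closure sees the loop variable's final value.
    return [lambda: k for k in keys]

def make_getters_fixed(keys):
    # Applying an outer lambda immediately freezes the current value,
    # exactly as Example #1 does with its field name f.
    return [(lambda k: lambda: k)(k) for k in keys]

print([g() for g in make_getters_broken("XYZ")])  # ['Z', 'Z', 'Z']
print([g() for g in make_getters_fixed("XYZ")])   # ['X', 'Y', 'Z']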
Example #2
    def _process_(self, data):
        # Remove the target field and reset the "changed" list to empty, both
        # in the field functions and in the per-field uuids.
        fields = data.field_funcs_m.copy()
        del fields[self.field]
        fields["changed"] = []

        uuids = data.uuids.copy()
        del uuids[self.field]
        uuids["changed"] = UUID(b"[]")

        return Data(data.uuid * self.uuid, uuids, data.history << self,
                    **fields)
Example #3
    def _process_(self, data: Data):
        # REMINDER: Binarize will do nothing to numeric datasets, but the uuid
        # still needs to be predictable, so the provided Data object should be
        # "processed" anyway.
        def func():
            data_nominal_idxs = nominal_idxs(data.Xt)
            encoder = OneHotEncoder()
            if len(data_nominal_idxs) > 0:
                # One-hot encode the nominal columns, keep the numeric ones,
                # and lay the two blocks side by side.
                nom = encoder.fit_transform(
                    data.X[:, data_nominal_idxs]).toarray()
                num = np.delete(data.X, data_nominal_idxs,
                                axis=1).astype(float)
                return np.column_stack((nom, num))
            return data.X

        return data.update(self, X=func)
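
For reference, the same binarization step outside the Data pipeline, as a
standalone sketch (toy matrix and a hand-picked nominal column; these
assumptions are not taken from the library):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Toy matrix: column 0 is nominal, column 1 is numeric.
X = np.array([["a", 1.0], ["b", 2.0], ["a", 3.0]], dtype=object)
nominal = [0]

nom = OneHotEncoder().fit_transform(X[:, nominal]).toarray()
num = np.delete(X, nominal, axis=1).astype(float)
print(np.column_stack((nom, num)))
# Two one-hot columns for {"a", "b"} followed by the numeric column.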
Example #4
    def fetch(self, data, lock=False, lazy=True, ignorelock=False):
        # , recursive=True):   # TODO: think about whether include_empty=False makes sense
        """Fetch the Data object fields on demand.

        data: uuid string or a (probably still not fully evaluated) Data object.
        """
        data_id = data if isinstance(data, str) else data.id
        try:
            ret = self.getdata(data_id, include_empty=True)
        except LockedEntryException:
            if not ignorelock:
                raise  # Propagate the lock to the caller.
            ret = None
            lock = False  # Already locked.

        if ret is None or not ret["uuids"]:
            # REMINDER: check_existence=False because the data may be empty
            # [and the Cache should keep working even if it was interrupted].
            if lock and not self.lock(data_id, check_existence=False):
                raise Exception("Could not lock data:", data_id)
            return

        dic = ret["uuids"].copy()
        if ret["stream"]:
            dic["stream"] = None
        if ret["inner"]:
            dic["inner"] = ret["inner"]

        fields = {} if isinstance(data, str) else data.field_funcs_m
        for field, fid in list(dic.items()):
            if field == "inner":
                fields[field] = lambda: self.fetch(fid)
            elif field == "stream":
                fields[field] = lambda: self.fetchstream(data_id, lazy)
            elif field == "changed":
                fields[field] = unpack(self.getcontent(fid)) if isinstance(
                    data, str) else data.changed
            elif field not in ["inner"] and (isinstance(data, str)
                                             or field in data.changed):
                if lazy:
                    fields[field] = (lambda fid_: lambda: unpack(
                        self.getcontent(fid_)))(fid)
                else:
                    fields[field] = unpack(self.getcontent(fid))

            # Call each lambda by a friendly name.
            if lazy and field != "changed" and islazy(
                    fields[field]):  # and field in data.field_funcs_m:
                fields[field].__name__ = "_" + fields[
                    field].__name__ + "_from_storage_" + self.id

        if isinstance(data, str):
            # if lazy:
            #     print("Check whether lazy still fails to return the history for data given by a uuid string")  # <-- TODO?
            history = self.fetchhistory(data)
        else:
            history = data.history
        # print("LOGGING:::  > > > > > > > > > fetched?", data_id, ret)
        return Data(UUID(data_id),
                    {k: UUID(v)
                     for k, v in ret["uuids"].items()}, history, **fields)
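
The renaming block above gives each lazy field a traceable __name__, so
debugging shows where a value will come from. A minimal standalone sketch of
the idea (hypothetical helper, not the library's API):

def lazy_field(fid, getcontent, storage_id):
    # Build a deferred loader and give it a descriptive name.
    loader = lambda: getcontent(fid)
    loader.__name__ = "_" + fid + "_from_storage_" + storage_id
    return loader

f = lazy_field("ab12", lambda i: "content of " + i, "disk0")
print(f.__name__)  # _ab12_from_storage_disk0
print(f())         # content of ab12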
Example #5
    def store(self, data: Data, unlock=False, ignoredup=False, lazy=False):
        """Store all Data object fields as soon as one of them is evaluated.

        The sequence of queries is planned to minimize traffic and CPU load;
        otherwise it would suffice to just send 'insert or ignore' of dumps.

        Parameters
        ----------
        data
            Data object to store.
        unlock
            Whether to release the lock held on the entry once it is stored.
        ignoredup
            Whether to send the query anyway, ignoring errors due to already
            existent registries.
        lazy
            Whether to defer the actual storage until one of the fields is
            evaluated.

        Returns
        -------
        The stored Data object (the outermost one when there is an inner
        chain).

        Raises
        ------
        DuplicateEntryException
        """
        if not ignoredup and self.hasdata(data.id) and not unlock:
            raise DuplicateEntryException(data.id)

        # Embed lazy storers inside the Data object.
        lst = []

        def func(held_data, name, field_funcs, puts):
            def lamb():
                for k, v in field_funcs.items():
                    if islazy(v):
                        v = v()  # Cannot call data[k] due to infinite loop.
                    # The old value may not be lazy, but the new one can be,
                    # due to this very lazystore.
                    held_data.field_funcs_m[k] = v
                    id = held_data.uuids[k].id
                    if id in puts:
                        if k != "inner":
                            # TODO/REMINDER: exceptionally two datasets can have some equal contents, like Xd;
                            #   so we send it again while the hash is not based on content
                            self.putcontent(id,
                                            fpack(held_data, k),
                                            ignoredup=True)
                rows = [(held_data.id, fname, fuuid.id)
                        for fname, fuuid in held_data.uuids.items()
                        if fname != "inner"]
                self.putfields(rows)
                return held_data.field_funcs_m[name]

            return lamb

        streams = {}
        while True:
            # Fields.
            cids = [u.id for u in data.uuids.values()]
            missing = [cid for cid in cids if cid not in self.hascontent(cids)]
            if lazy:
                # All fields will be evaluated at the time one of them is called.
                field_funcs_copy = data.field_funcs_m.copy()
                for field in data.field_funcs_m:
                    if field == "stream":
                        # data.field_funcs_m["stream"] = map(
                        #     lambda d: self.store(d, unlock=True, lazy=False),
                        #     data.field_funcs_m["stream"]
                        # )
                        raise Exception(
                            "A lazy storage cannot handle streams for now.")
                    data.field_funcs_m[field] = func(data, field,
                                                     field_funcs_copy, missing)
                    data.field_funcs_m[field].__name__ = "_" + data.uuids[
                        field].id + "_to_storage_" + self.id
            else:
                for k, v in data.items():
                    if k == "stream" and data.hasstream:
                        # Consume stream, to be stored after putdata().
                        streams[data.id] = list(data.stream)
                    else:
                        if k != "inner" and k != "stream":
                            id = data.uuids[k].id
                            if id in missing:
                                content = fpack(data, k)
                                # TODO/REMINDER: exceptionally two datasets can have some equal contents, like Xd;
                                #   so we send it again while the hash is not based on content
                                self.putcontent(id, content, ignoredup=True)

            lst.append(data)
            if not data.hasinner:
                break
            data = data.inner

        for i, d in reversed(list(enumerate(lst))):
            if i == 0 and unlock:
                self.unlock(d.id)

            # History.
            ancestor_duuid = Root.uuid
            for step in list(d.history):
                # print("LOGGING:::  ssssssssSSSSSSSSSSSS", step.id)
                if not self.hasstep(step.id):
                    self.storestep(step)

                parent_uuid = ancestor_duuid
                ancestor_duuid = ancestor_duuid * step.uuid
                # Here, locked=NULL means 'placeholder', which can be updated in the future if the same data happens to be truly stored.
                # We assume it is faster to do a single insertignore than select+insert, hence ignoredup=True here.
                if ancestor_duuid == d.uuid:
                    hasstream, inner = d.hasstream, d.inner if d.hasinner else None
                else:
                    hasstream, inner = False, None
                self.putdata(ancestor_duuid.id,
                             step.id,
                             inner and inner.id,
                             hasstream,
                             parent_uuid.id,
                             None,
                             ignoredup=True)
                # TODO: adopt proper logging here.

            if not lazy:
                if d.id in streams:
                    rows = []
                    for pos, streamed_data in enumerate(streams[d.id]):
                        self.store(streamed_data, ignoredup=True)
                        rows.append((d.id, str(pos), streamed_data.id))
                    if not rows:
                        raise Exception("Empty stream??")
                    self.putstream(rows, ignoredup=ignoredup)

                    # Return a new iterator in the place of the original stream.
                    d.field_funcs_m["stream"] = iter(streams[d.id])
                self.putfields([(d.id, fname, fuuid.id)
                                for fname, fuuid in d.uuids.items()],
                               ignoredup=True)
        return lst[0]
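
The lazy branch above wraps every field so that the first evaluation of any
one of them triggers the full store exactly once. A toy sketch of that
trigger pattern (hypothetical names, no real storage behind it):

def lazystore_fields(field_funcs, store_all):
    # Wrap each field so the first evaluation persists everything once.
    done = {"stored": False}

    def wrap(name):
        def lamb():
            if not done["stored"]:
                store_all(field_funcs)  # evaluate/persist every field once
                done["stored"] = True
            v = field_funcs[name]
            return v() if callable(v) else v
        return lamb

    return {name: wrap(name) for name in field_funcs}

log = []
fields = lazystore_fields({"X": lambda: [1, 2], "y": [0]},
                          lambda ff: log.append(sorted(ff)))
print(fields["X"]())  # [1, 2]
print(fields["y"]())  # [0]
print(log)            # [['X', 'y']] -- the store callback ran exactly once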
Example #6
    def _process_(self, data: Data):
        # Defer the transform until X is actually requested.
        newX = lambda: self.model(data.inner).transform(data.X)
        return data.update(self, X=newX)
Example #7
    def _process_(self, data: Data):
        # Evaluate all selected metrics lazily, as a single result vector R.
        newr = lambda: np.array(
            [f(data, self.target, self.prediction) for f in self.selected])
        return data.update(self, R=newr)
Example #8
    def _process_(self, data: Data):
        # Z: predicted labels; P: class probabilities (only when requested).
        matrices = {"Z": lambda: self.model(data.inner).predict(data.X)}
        if "probability" in self.config and self.config["probability"]:
            matrices["P"] = lambda: self.model(data.inner).predict_proba(data.X)
        return data.update(self, **matrices)
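
The Z/P pair mirrors scikit-learn's predict/predict_proba split. A standalone
sketch with a stand-in classifier (toy data; LogisticRegression is only an
assumption about what self.model could return):

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
model = LogisticRegression().fit(X, y)

matrices = {"Z": lambda: model.predict(X)}
if True:  # stands in for self.config["probability"]
    matrices["P"] = lambda: model.predict_proba(X)

print(matrices["Z"]())  # predicted class labels
print(matrices["P"]())  # class probabilities, one row per sample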