Beispiel #1
0
    def uuid(self):
        """
        Identifier built from the name UUID and the history UUID.

        The value is computed on the first call and kept in ``self._uuid``;
        later calls return the stored value.
        :return: the content of ``self._uuid``
        """
        if self._uuid is not None:
            return self._uuid

        # A dataset with the same name and fields can occupy more than one
        # row in the storage when different models provide different
        # predictions/transformations of it. Appending the history UUID to
        # the hashed text keeps those rows distinguishable.
        combined = self.name_uuid() + self.history_uuid()
        self._uuid = uuid(combined.encode())
        return self._uuid
Beispiel #2
0
 def field_uuid(self, field_name):
     """
     UUID of the matrix/vector associated to the given field.

     The result is cached in the instance dictionary under the key
     'uuid' + field_name. It is computed from field_dump(field_name) only
     while that entry is still None; a missing entry raises KeyError.
     :param field_name: Case sensitive.
     :return: UUID
     """
     slot = 'uuid' + field_name
     state = self.__dict__
     if state[slot] is not None:
         return state[slot]
     state[slot] = uuid(self.field_dump(field_name))
     return state[slot]
Beispiel #3
0
    def store_metadata(self, data: Data):
        """
        Insert the attr, dataset and hist rows that describe a Data object.

        Intended to be used before Data is stored. The three statements are
        'insert or ignore' inserts, run in that order with warnings silenced.
        :param data: object providing columns, name, name_uuid(),
            history_uuid() and history_dump()
        :return: None
        """
        # TODO: avoid sending long cols blob when unneeded
        packed_cols = zlibext_pack(data.columns)
        cols_id = uuid(packed_cols)

        attr_sql = '''
            insert or ignore into attr values (
                NULL,
                ?,
                ?
            );'''
        # TODO: avoid sending long names when unneeded
        dataset_sql = '''
            insert or ignore into dataset values (
                NULL,
                ?,
                ?,
                ?
            );'''
        # TODO: avoid sending long hist blob when unneeded
        hist_sql = '''
            insert or ignore into hist values (
                NULL,
                ?,
                ?
            )'''

        # Arguments are produced lazily so that each list is built only when
        # its statement is about to run, inside the silenced-warnings scope.
        steps = (
            (attr_sql, lambda: [cols_id, packed_cols]),
            (dataset_sql, lambda: [data.name_uuid(), data.name, cols_id]),
            (hist_sql, lambda: [data.history_uuid(), data.history_dump()]),
        )
        for statement, make_args in steps:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.query(statement, make_args())
Beispiel #4
0
    def get_data_by_name_impl(self, name, fields=None, history=None):
        """
        Fetch the Data stored under a dataset name and a history.

        Passing history=None recovers the original dataset. When fetching
        prediction data (i.e., results), the history which led to the
        predictions should be provided. Specify fields to reduce traffic;
        otherwise every single-letter column of the row that is not None
        is fetched.
        :param name: dataset name, matched against the 'des' column
        :param fields: None=get full Data; otherwise comma-separated field
            names, e.g. 'X,y,Z'
            (NOTE(review): the original doc says "case insensitive"; that
            depends on the row type returned by get_one() - confirm)
        :param history: nested tuples
        :return: Data, or None when no row matches
        """
        history_id = uuid(zlibext_pack(history))

        select_row = '''
                select 
                    X,Y,Z,P,U,V,W,Q,R,S,l,m,T,C,cols,des
                from 
                    data 
                        left join dataset on dataset=dsid 
                        left join attr on attr=aid
                where 
                    des=? and hist=?'''
        self.query(select_row, [name, history_id])
        row = self.get_one()
        if row is None:
            return None

        # Decide which matrices/vectors have to be recovered.
        if fields is None:
            wanted = []
            for column, value in row.items():
                if len(column) == 1 and value is not None:
                    wanted.append(column)
        else:
            wanted = fields.split(',')

        # Each selected column holds the id of a row in 'mat'.
        kwargs = {'name': name, 'history': history}
        for field in wanted:
            matrix_id = row[field]
            if matrix_id is None:
                continue
            self.query('select val,w,h from mat where mid=?', [matrix_id])
            stored = self.get_one()
            kwargs[field] = unpack_data(
                stored['val'], stored['w'], stored['h'])
        return Data(columns=zlibext_unpack(row['cols']), **kwargs)
Beispiel #5
0
    def __init__(self, config):
        """
        Build the component state from a configuration mapping.

        A copy of config is kept in self.config. 'mark' and 'max_time' are
        moved out of that copy, 'random_state' is dropped when
        isdeterministic() is true, and 'class'/'module' are filled in from
        the instance when absent. The resulting config is serialized with
        json_pack and hashed into self.uuid.
        :param config: mapping with a copy() method; it is not modified
        """
        self.config = config.copy()

        # 'mark' should not identify components, it only marks results.
        # 'mark', 'max_time', 'class', 'module' are reserved words.
        self.mark = self.config.pop('mark', None)
        self.max_time = self.config.pop('max_time', None)

        # 'random_state' is a reserved word.
        if self.isdeterministic():
            self.config.pop('random_state', None)

        self.config.setdefault('class', self.__class__.__name__)
        self.config.setdefault('module', self.__module__)

        self._serialized = json_pack(self.config)
        self.uuid = uuid(self._serialized.encode())

        self.name = self.config['class']
        self.module = self.config['module']

        # a = Apply, u = Use
        self._modified = dict(a=None, u=None)

        # self.model here refers to classifiers, preprocessors and, possibly,
        # some representation of pipelines or the autoML itself.
        # Another possibility is to generalize modules to a new class Model()
        # that has self.model.
        self.unfit = True
        self.model = None

        # Each apply() uses a different training data, so this uuid is mutable
        self._train_data_uuid__mutable = None

        # Status fields, all starting in their "nothing happened yet" state.
        self.locked_by_others = False
        self.failed = False
        self.failure = None
        self.time_spent = None
        self.host = None
        self._param = None
Beispiel #6
0
    def store_result_impl(self, component, input_data, output_data):
        """
        Store a result and remove lock.

        Steps, in order: store output_data (when given), optionally store a
        dump of the component, store a log entry when the component carries
        a failure text, then update the matching 'res' row.
        :param component: provides uuid, op, mark, failed, failure,
            time_spent and train_data_uuid__mutable()
        :param input_data: provides uuid(); used to locate the 'res' row
        :param output_data: resulting data or None
        :return: None
        """

        # Store resulting Data
        if output_data is not None:
            # We call impl here,
            # to avoid nested storage trying to do the same work twice.
            self.store_data_impl(output_data)

        # Remove lock and point result to data inserted above.
        # We should set all timestamp fields even if with the same old value,
        # due to automatic updates by DBMSs.
        # Data train was inserted and dtr was created when locking().
        now = self._now_function()

        # Store dump if requested.
        dump_uuid = None
        if self._dump:
            # pack_comp is nondeterministic and its result is big,
            # so we need to identify it by other, deterministic, means
            dump_uuid = uuid(
                (component.uuid + component.train_data_uuid__mutable()
                 ).encode())
            sql = 'insert or ignore into inst values (null, ?, ?)'
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.query(sql, [dump_uuid, pack_comp(component)])

        # Store a log if apply() failed.
        # The id is derived under the same 'is not None' test that guards
        # the insert, so an empty failure text is hashed like any other
        # instead of being used as its own id.
        log_uuid = None
        if component.failure is not None:
            log_uuid = uuid(component.failure.encode())
            sql = f'insert or ignore into log values (null, ?, ?, {now})'
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                self.query(sql, [log_uuid, component.failure])

        # Unlock and save result.
        fail = 1 if component.failed else 0
        sql = f'''
                update res set 
                    log=?,
                    dout=?, spent=?, inst=?,
                    fail=?,
                    start=start, end={now}, alive={now},
                    mark=?
                where
                    config=? and op=? and 
                    dtr=? and din=?
                '''
        # Explicit None test: an output object that evaluates as false
        # (e.g. one reporting zero length) must still contribute its uuid,
        # matching the 'is not None' test used when storing it above.
        output_uuid = None if output_data is None else output_data.uuid()
        set_args = [
            log_uuid, output_uuid, component.time_spent,
            dump_uuid, fail, component.mark
        ]
        where_args = [
            component.uuid, component.op,
            component.train_data_uuid__mutable(),
            input_data.uuid()
        ]
        # TODO: is there any important exception to handle here?
        self.query(sql, set_args + where_args)
        print(self.name, 'Stored!\n')
Beispiel #7
0
 def history_uuid(self):
     """
     UUID of history_dump(), computed on first use and cached in
     ``self._history_uuid``.
     :return: UUID
     """
     if self._history_uuid is not None:
         return self._history_uuid
     dumped = self.history_dump()
     self._history_uuid = uuid(dumped)
     return self._history_uuid
Beispiel #8
0
 def name_uuid(self):
     """
     UUID of the encoded name, computed on first use and cached in
     ``self._name_uuid``.
     :return: UUID
     """
     if self._name_uuid is not None:
         return self._name_uuid
     encoded_name = self.name.encode()
     self._name_uuid = uuid(encoded_name)
     return self._name_uuid