Beispiel #1
0
    model_interface = WithKWArgsWrapper(ModelInterface(),
                                        company_id=COMPANY_ID)
    datasource_interface = WithKWArgsWrapper(DatasourceController(),
                                             company_id=COMPANY_ID)
    raw_model_data_arr = model_interface.get_models()
    model_data_arr = []
    for model in raw_model_data_arr:
        if model['status'] == 'complete':
            x = model_interface.get_model_data(model['name'])
            try:
                model_data_arr.append(
                    model_interface.get_model_data(model['name']))
            except Exception:
                pass

    is_cloud = config.get('cloud', False)
    if not is_cloud:
        # region Mark old predictors as outdated
        is_modified = False
        predictor_records = db.session.query(db.Predictor).all()
        if len(predictor_records) > 0:
            sucess, compatible_versions = get_versions_where_predictors_become_obsolete(
            )
            if sucess is True:
                compatible_versions = [
                    version.parse(x) for x in compatible_versions
                ]
                mindsdb_version_parsed = version.parse(mindsdb_version)
                compatible_versions = [
                    x for x in compatible_versions
                    if x <= mindsdb_version_parsed
class ModelController():
    """Manage predictors: create, train, query, describe, rename and delete them.

    Predictor metadata is read and written through ``db.session``; predictors
    loaded for prediction are kept in ``predictor_cache`` between calls.
    """
    # Configuration object, instantiated in ``__init__``.
    config: Config
    # File-store helper used to fetch and delete stored predictor state.
    fs_store: FsStore
    # Loaded predictors, keyed by '<company_id>@@@@@<predictor name>'.
    predictor_cache: Dict[str, Dict[str, Union[Any]]]
    # Flag supplied by the caller at construction time.
    ray_based: bool

    def __init__(self, ray_based: bool) -> None:
        """Set up configuration, file storage and an empty predictor cache.

        :param ray_based: flag stored on the instance as ``ray_based``.
        """
        self.ray_based = ray_based
        self.predictor_cache = {}
        self.config = Config()
        self.fs_store = FsStore()

    def _invalidate_cached_predictors(self) -> None:
        # @TODO: Cache will become stale if the respective ModelInterface is not invoked yet a bunch of predictors remained cached, no matter where we invoke it. In practice shouldn't be a big issue though
        for predictor_name in list(self.predictor_cache.keys()):
            if (datetime.datetime.now() -
                    self.predictor_cache[predictor_name]['created']
                ).total_seconds() > 1200:
                del self.predictor_cache[predictor_name]

    def _lock_predictor(self, id: int, mode: str) -> bool:
        """Block until a semaphore for predictor ``id`` is held, then return True.

        A new ``Semaphor`` row is inserted for the predictor; if the insert
        fails the method sleeps one second and tries again, indefinitely.
        An already existing 'read' semaphore is shared when ``mode`` is
        'read', in which case no new row is written.

        :param id: predictor id used as ``entity_id`` of the semaphore row.
        :param mode: value stored in the ``action`` column, e.g. 'read'.
        :returns: always ``True`` once the lock is considered held.
        """
        from mindsdb.interfaces.storage.db import session, Semaphor

        while True:
            semaphor_record = session.query(Semaphor).filter_by(
                entity_id=id, entity_type='predictor').first()
            # Readers may share a lock that is already held for reading.
            if (semaphor_record is not None and mode == 'read'
                    and semaphor_record.action == 'read'):
                return True
            try:
                session.add(
                    Semaphor(entity_id=id,
                             entity_type='predictor',
                             action=mode))
                session.commit()
                return True
            except Exception:
                # Presumably the insert was rejected because another holder
                # already owns the semaphore (TODO confirm the constraint).
                # Roll back explicitly so the failed pending row does not
                # linger in the session before the next attempt.
                session.rollback()
            time.sleep(1)

    def _unlock_predictor(self, id: int) -> None:
        """Delete the first semaphore row found for predictor ``id``, if there is one."""
        from mindsdb.interfaces.storage.db import session, Semaphor

        held = (session.query(Semaphor)
                .filter_by(entity_id=id, entity_type='predictor')
                .first())
        if held is None:
            return
        session.delete(held)
        session.commit()

    @contextmanager
    def _lock_context(self, id, mode: str):
        try:
            self._lock_predictor(id, mode)
            yield True
        finally:
            self._unlock_predictor(id)

    def _get_from_data_df(self, from_data: dict) -> DataFrame:
        """Instantiate the datasource described by ``from_data`` and return its ``df``.

        :param from_data: dict with the keys 'class' (attribute name looked up
            on ``mindsdb_datasources``), 'args' and 'kwargs'.
        """
        datasource_class = getattr(mindsdb_datasources, from_data['class'])
        positional, named = from_data['args'], from_data['kwargs']
        return datasource_class(*positional, **named).df

    def _unpack_old_args(
        self,
        from_data: dict,
        kwargs: dict,
        to_predict: Optional[Union[str, list]] = None
    ) -> Tuple[pd.DataFrame, ProblemDefinition, bool]:
        problem_definition = kwargs or {}
        if isinstance(to_predict, str):
            problem_definition['target'] = to_predict
        elif isinstance(to_predict, list) and len(to_predict) == 1:
            problem_definition['target'] = to_predict[0]
        elif problem_definition.get('target') is None:
            raise Exception(
                f"Predict target must be 'str' or 'list' with 1 element. Got: {to_predict}"
            )

        join_learn_process = kwargs.get('join_learn_process', False)
        if 'join_learn_process' in kwargs:
            del kwargs['join_learn_process']

        # Adapt kwargs to problem definition
        if 'timeseries_settings' in kwargs:
            problem_definition['timeseries_settings'] = kwargs[
                'timeseries_settings']

        if 'stop_training_in_x_seconds' in kwargs:
            problem_definition['time_aim'] = kwargs[
                'stop_training_in_x_seconds']

        if kwargs.get('ignore_columns') is not None:
            problem_definition['ignore_features'] = kwargs['ignore_columns']

        if (problem_definition.get('ignore_features') is not None
                and isinstance(problem_definition['ignore_features'],
                               list) is False):
            problem_definition['ignore_features'] = [
                problem_definition['ignore_features']
            ]

        df = self._get_from_data_df(from_data)

        return df, problem_definition, join_learn_process

    @mark_process(name='learn')
    def learn(self,
              name: str,
              from_data: dict,
              to_predict: str,
              datasource_id: int,
              kwargs: dict,
              company_id: int,
              delete_ds_on_fail: Optional[bool] = False) -> None:
        """Create a predictor record and start a ``LearnProcess`` for it.

        :param name: predictor name; must be unused within ``company_id``.
        :param from_data: datasource description used to load the training data.
        :param to_predict: target column.
        :param datasource_id: id stored on the predictor record.
        :param kwargs: legacy learn arguments; 'join_learn_process' decides
            whether this call waits for the learn process to finish.
        :param company_id: company that owns the predictor.
        :param delete_ds_on_fail: forwarded to ``LearnProcess``.
        :raises Exception: if a predictor with this name already exists.
        """
        existing_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name).first()
        if existing_record is not None:
            raise Exception('Predictor name must be unique.')

        df, problem_definition, join_learn_process = self._unpack_old_args(
            from_data, kwargs, to_predict)

        problem_definition = ProblemDefinition.from_dict(problem_definition)
        predictor_record = db.Predictor(
            company_id=company_id,
            name=name,
            datasource_id=datasource_id,
            mindsdb_version=mindsdb_version,
            lightwood_version=lightwood_version,
            to_predict=problem_definition.target,
            learn_args=problem_definition.to_dict(),
            data={'name': name})

        db.session.add(predictor_record)
        db.session.commit()
        predictor_id = predictor_record.id

        p = LearnProcess(df, problem_definition, predictor_id,
                         delete_ds_on_fail)
        p.start()
        if join_learn_process:
            p.join()
            # Process.close() does not exist on Python 3.6.
            if not IS_PY36:
                p.close()
        # Reload the record, which the learn process may have changed.
        # A status value used to be derived here but was never returned or
        # stored, so that dead computation has been dropped.
        db.session.refresh(predictor_record)

    @mark_process(name='predict')
    def predict(self, name: str, when_data: Union[dict, list, pd.DataFrame],
                pred_format: str, company_id: int):
        """Run the named predictor on ``when_data`` and format the result.

        :param name: predictor name (without company prefix).
        :param when_data: a datasource description dict (with 'class', 'args'
            and 'kwargs'), a single row as a dict, or anything accepted by
            ``pd.DataFrame``.
        :param pred_format: 'explain', 'dict' or 'dict&explain' for the legacy
            layouts; any other value returns the raw prediction records.
        :param company_id: company that owns the predictor.
        :raises Exception: if the predictor has to be loaded but its status is
            not 'complete'.
        """
        original_name = name
        name = f'{company_id}@@@@@{name}'

        predictor_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=original_name).first()
        assert predictor_record is not None
        predictor_data = self.get_model_data(name, company_id)
        fs_name = f'predictor_{company_id}_{predictor_record.id}'

        # A cached copy is only valid for the record version it was loaded from.
        is_stale = (name in self.predictor_cache
                    and self.predictor_cache[name]['updated_at'] !=
                    predictor_record.updated_at)
        if is_stale:
            del self.predictor_cache[name]

        if name not in self.predictor_cache:
            # Clear the cache entirely if we have less than 1.2 GB left
            if psutil.virtual_memory().available < 1.2 * pow(10, 9):
                self.predictor_cache = {}

            if predictor_data['status'] != 'complete':
                raise Exception(
                    f'Trying to predict using predictor {original_name} with status: {predictor_data["status"]}. Error is: {predictor_data.get("error", "unknown")}'
                )

            predictors_dir = self.config['paths']['predictors']
            self.fs_store.get(fs_name, fs_name, predictors_dir)
            state_path = os.path.join(predictors_dir, fs_name)
            loaded_predictor = lightwood.predictor_from_state(
                state_path, predictor_record.code)
            self.predictor_cache[name] = {
                'predictor': loaded_predictor,
                'updated_at': predictor_record.updated_at,
                'created': datetime.datetime.now(),
                'code': predictor_record.code,
                'pickle': str(state_path)
            }

        is_datasource_spec = (isinstance(when_data, dict)
                              and 'kwargs' in when_data
                              and 'args' in when_data)
        if is_datasource_spec:
            ds_cls = getattr(mindsdb_datasources, when_data['class'])
            df = ds_cls(*when_data['args'], **when_data['kwargs']).df
        elif isinstance(when_data, dict):
            # A single row given as a dict.
            df = pd.DataFrame([when_data])
        else:
            df = pd.DataFrame(when_data)

        predictions = self.predictor_cache[name]['predictor'].predict(df)
        predictions = predictions.to_dict(orient='records')
        # Below is useful for debugging caching and storage issues
        # del self.predictor_cache[name]

        target = predictor_record.to_predict[0]

        # New format -- Try switching to this in 2-3 months for speed, for now the legacy layouts below are ok
        if pred_format not in ('explain', 'dict', 'dict&explain'):
            return predictions

        explain_arr = []
        dict_arr = []
        for position, row in enumerate(predictions):
            explain_arr.append({
                target: {
                    'predicted_value': row['prediction'],
                    'confidence': row.get('confidence', None),
                    'confidence_lower_bound': row.get('lower', None),
                    'confidence_upper_bound': row.get('upper', None),
                    'anomaly': row.get('anomaly', None),
                    'truth': row.get('truth', None)
                }
            })

            td = {'predicted_value': row['prediction']}
            for col in df.columns:
                # Prefer the value echoed back in the prediction row, under
                # its plain name or an 'order_'/'group_' prefixed name.
                for candidate in (col, f'order_{col}', f'group_{col}'):
                    if candidate in row:
                        td[col] = row[candidate]
                        break
                else:
                    # Otherwise read it from the input frame.
                    source_index = row.get('original_index')
                    if source_index is None:
                        log.warning('original_index is None')
                        source_index = position
                    td[col] = df.iloc[source_index][col]
            dict_arr.append({target: td})

        if pred_format == 'explain':
            return explain_arr
        if pred_format == 'dict':
            return dict_arr
        return dict_arr, explain_arr

    @mark_process(name='analyse')
    def analyse_dataset(self, ds: dict,
                        company_id: int) -> lightwood.DataAnalysis:
        """Load the datasource described by ``ds`` and return its analysis as a dict.

        :param ds: dict with the keys 'class', 'args' and 'kwargs'.
        :param company_id: not used by this method.
        """
        datasource_class = getattr(mindsdb_datasources, ds['class'])
        frame = datasource_class(*ds['args'], **ds['kwargs']).df
        return lightwood.analyze_dataset(frame).to_dict()  # type: ignore

    def get_model_data(self, name, company_id: int):
        """Return a dict describing the stored predictor ``name``.

        The result is a deep copy of the record's ``data`` (an empty dict when
        the record has none) enriched with record fields and a derived
        'status'; 'accuracy' is added when 'accuracies' is non-empty.

        :param name: predictor name, with or without a
            '<company_id>@@@@@' prefix (the prefix is dropped).
        :param company_id: company that owns the predictor.
        """
        if '@@@@@' in name:
            sn = name.split('@@@@@')
            assert len(sn) < 3  # security
            name = sn[1]

        predictor_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name).first()
        assert predictor_record is not None

        linked_db_ds = db.session.query(db.Datasource).filter_by(
            company_id=company_id, id=predictor_record.datasource_id).first()

        # ``data`` may be missing on the record; fall back to an empty dict so
        # the fields below can still be filled in.
        record_data = predictor_record.data
        data = deepcopy(record_data) if record_data is not None else {}
        data['dtype_dict'] = predictor_record.dtype_dict
        # Timestamps are cut at the first '.' (fractional seconds) first.
        data['created_at'] = str(
            parse_datetime(str(predictor_record.created_at).split('.')[0]))
        data['updated_at'] = str(
            parse_datetime(str(predictor_record.updated_at).split('.')[0]))
        data['predict'] = predictor_record.to_predict[0]
        data['update'] = predictor_record.update_status
        data['mindsdb_version'] = predictor_record.mindsdb_version
        data['name'] = predictor_record.name
        data['code'] = predictor_record.code
        data['json_ai'] = predictor_record.json_ai
        data['data_source_name'] = linked_db_ds.name if linked_db_ds else None
        data['problem_definition'] = predictor_record.learn_args

        # assume older models are complete, only temporary
        if record_data is not None and 'error' in record_data:
            data['status'] = 'error'
        elif predictor_record.update_status == 'available':
            data['status'] = 'complete'
        elif predictor_record.json_ai is None and predictor_record.code is None:
            data['status'] = 'generating'
        elif record_data is None:
            data['status'] = 'editable'
        elif 'training_log' in record_data:
            data['status'] = 'training'
        else:
            data['status'] = 'complete'

        accuracies = data.get('accuracies', None)
        if accuracies is not None and len(accuracies) > 0:
            data['accuracy'] = float(np.mean(list(accuracies.values())))
        return data

    def get_model_description(self, name: str, company_id: int):
        """
        Similar to `get_model_data` but meant to be seen directly by the user, rather than parsed by something like the Studio predictor view.

        Uses `get_model_data` to compose this, but in the future we might want to make this independent if we deprected `get_model_data`

        :returns: Dictionary of the analysis (meant to be foramtted by the APIs and displayed as json/yml/whatever)
        """ # noqa
        model_description = {}
        model_data = self.get_model_data(name, company_id)

        model_description['accuracies'] = model_data['accuracies']
        model_description['column_importances'] = model_data[
            'column_importances']
        model_description['outputs'] = [model_data['predict']]
        model_description['inputs'] = [
            col for col in model_data['dtype_dict']
            if col not in model_description['outputs']
        ]
        model_description['datasource'] = model_data['data_source_name']
        model_description['model'] = ' --> '.join(
            str(k) for k in model_data['json_ai'])

        return model_description

    def get_models(self, company_id: int):
        """Return a reduced description of every predictor owned by ``company_id``.

        Each entry holds a fixed set of keys taken from ``get_model_data``
        (``None`` when absent); the three date fields are converted with
        ``parse_datetime``.
        """
        plain_keys = [
            'name', 'version', 'is_active', 'predict', 'status',
            'current_phase', 'accuracy', 'data_source', 'update',
            'data_source_name', 'mindsdb_version', 'error'
        ]
        date_keys = ['train_end_at', 'updated_at', 'created_at']

        models = []
        for db_p in db.session.query(
                db.Predictor).filter_by(company_id=company_id):
            model_data = self.get_model_data(db_p.name, company_id=company_id)
            reduced_model_data = {
                k: model_data.get(k, None) for k in plain_keys
            }

            for k in date_keys:
                value = model_data.get(k, None)
                if value is not None:
                    try:
                        # Parse only the part before the first '.'.
                        value = parse_datetime(str(value).split('.')[0])
                    except Exception as e:
                        # @TODO Does this ever happen
                        # The exception is interpolated into the message: it
                        # used to be passed as a stray positional argument
                        # with no placeholder to receive it.
                        log.error(
                            f'Date parsing exception while parsing: {k} in get_models: {e}'
                        )
                        value = parse_datetime(str(value))
                reduced_model_data[k] = value

            models.append(reduced_model_data)
        return models

    def delete_model(self, name, company_id: int):
        """Delete a predictor record, its registration and its stored state.

        The linked datasource is deleted as well when its ``data`` is a JSON
        string whose 'source_type' is not 'file'; failures in that step are
        ignored.

        :raises Exception: if the predictor does not exist.
        :returns: 0
        """
        original_name = name
        name = f'{company_id}@@@@@{name}'

        lookup = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=original_name)
        db_p = lookup.first()
        if db_p is None:
            raise Exception(f"Predictor '{name}' does not exist")

        db.session.delete(db_p)
        if db_p.datasource_id is not None:
            # Best effort: a failure here must not block predictor removal.
            try:
                dataset_record = db.Datasource.query.get(db_p.datasource_id)
                stored_as_text = isinstance(dataset_record.data, str)
                if (stored_as_text and json.loads(
                        dataset_record.data).get('source_type') != 'file'):
                    DataStore().delete_datasource(dataset_record.name,
                                                  company_id)
            except Exception:
                pass
        db.session.commit()

        wrapper = DatabaseWrapper(company_id)
        wrapper.unregister_predictor(name)

        # delete from s3
        self.fs_store.delete(f'predictor_{company_id}_{db_p.id}')

        return 0

    def rename_model(self, old_name, new_name, company_id: int):
        """Rename a predictor and re-register it under the new name.

        :param old_name: current predictor name.
        :param new_name: name to store on the record.
        :param company_id: company that owns the predictor.
        :raises Exception: if no predictor called ``old_name`` exists.
        """
        db_p = db.session.query(db.Predictor).filter_by(company_id=company_id,
                                                        name=old_name).first()
        # Fail with a clear message instead of an AttributeError on None,
        # mirroring ``delete_model``.
        if db_p is None:
            raise Exception(f"Predictor '{old_name}' does not exist")
        db_p.name = new_name
        db.session.commit()
        dbw = DatabaseWrapper(company_id)
        dbw.unregister_predictor(old_name)
        dbw.register_predictors([self.get_model_data(new_name, company_id)])

    @mark_process(name='learn')
    def update_model(self, name: str, company_id: int):
        """Flag the predictor as 'updating' and start an ``UpdateProcess`` for it.

        :returns: a fixed status message; the call does not wait for the process.
        """
        # TODO: Add version check here once we're done debugging
        lookup = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name)
        predictor_record = lookup.first()
        assert predictor_record is not None

        predictor_record.update_status = 'updating'
        db.session.commit()

        worker = UpdateProcess(name, company_id)
        worker.start()
        return 'Updated in progress'

    @mark_process(name='learn')
    def generate_predictor(self, name: str, from_data: dict, datasource_id,
                           problem_definition_dict: dict,
                           join_learn_process: bool, company_id: int):
        """Create a predictor record and start a ``GenerateProcess`` for it.

        :param name: predictor name; must be unused within ``company_id``.
        :param from_data: datasource description used to load the data.
        :param datasource_id: id stored on the predictor record.
        :param problem_definition_dict: problem definition as a plain dict.
        :param join_learn_process: wait for the process to finish when true.
        :param company_id: company that owns the predictor.
        :raises Exception: if a predictor with this name already exists.
        """
        duplicate = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name).first()
        if duplicate is not None:
            raise Exception('Predictor name must be unique.')

        # The third element (join flag taken from the dict) is ignored here;
        # the explicit ``join_learn_process`` argument is used instead.
        df, raw_definition, _ = self._unpack_old_args(
            from_data, problem_definition_dict)
        problem_definition = ProblemDefinition.from_dict(raw_definition)

        record_fields = dict(
            company_id=company_id,
            name=name,
            datasource_id=datasource_id,
            mindsdb_version=mindsdb_version,
            lightwood_version=lightwood_version,
            to_predict=problem_definition.target,
            learn_args=problem_definition.to_dict(),
            data={'name': name})
        predictor_record = db.Predictor(**record_fields)

        db.session.add(predictor_record)
        db.session.commit()

        worker = GenerateProcess(df, problem_definition, predictor_record.id)
        worker.start()
        if join_learn_process:
            worker.join()
            if not IS_PY36:
                worker.close()
        db.session.refresh(predictor_record)

    def edit_json_ai(self, name: str, json_ai: dict, company_id=None):
        """Replace a predictor's JsonAI and regenerate its code from it."""
        lookup = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name)
        predictor_record = lookup.first()
        assert predictor_record is not None

        parsed_json_ai = lightwood.JsonAI.from_dict(json_ai)
        predictor_record.code = lightwood.code_from_json_ai(parsed_json_ai)
        predictor_record.json_ai = parsed_json_ai.to_dict()
        db.session.commit()

    def code_from_json_ai(self, json_ai: dict, company_id=None):
        """Return the code lightwood generates for the given JsonAI dict.

        ``company_id`` is accepted but not used.
        """
        return lightwood.code_from_json_ai(
            lightwood.JsonAI.from_dict(json_ai))

    def edit_code(self, name: str, code: str, company_id=None):
        """Edit an existing predictor's code"""
        if self.config.get('cloud', False):
            raise Exception('Code editing prohibited on cloud')

        predictor_record = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name).first()
        assert predictor_record is not None

        lightwood.predictor_from_code(code)
        predictor_record.code = code
        predictor_record.json_ai = None
        db.session.commit()

    @mark_process(name='learn')
    def fit_predictor(self, name: str, from_data: dict,
                      join_learn_process: bool, company_id: int) -> None:
        """Start a ``FitProcess`` for an existing predictor.

        :param name: name of the predictor to fit.
        :param from_data: datasource description used to load the data.
        :param join_learn_process: wait for the process to finish when true.
        :param company_id: company that owns the predictor.
        """
        lookup = db.session.query(db.Predictor).filter_by(
            company_id=company_id, name=name)
        predictor_record = lookup.first()
        assert predictor_record is not None

        training_frame = self._get_from_data_df(from_data)
        worker = FitProcess(predictor_record.id, training_frame)
        worker.start()
        if not join_learn_process:
            return
        worker.join()
        if not IS_PY36:
            worker.close()
Beispiel #3
0
    if os.path.isdir(root_storage_dir) is False:
        os.makedirs(root_storage_dir)

    if 'storage_db' in user_config:
        os.environ['MINDSDB_DB_CON'] = user_config['storage_db']
    elif os.environ.get('MINDSDB_DB_CON', '') == '':
        os.environ['MINDSDB_DB_CON'] = 'sqlite:///' + os.path.join(
            os.environ['MINDSDB_STORAGE_DIR'],
            'mindsdb.sqlite3.db') + '?check_same_thread=False&timeout=30'

    from mindsdb.utilities.config import Config
    mindsdb_config = Config()
    create_dirs_recursive(mindsdb_config['paths'])

    os.environ['DEFAULT_LOG_LEVEL'] = os.environ.get('DEFAULT_LOG_LEVEL',
                                                     'ERROR')
    os.environ['LIGHTWOOD_LOG_LEVEL'] = os.environ.get('LIGHTWOOD_LOG_LEVEL',
                                                       'ERROR')
    os.environ['MINDSDB_STORAGE_PATH'] = mindsdb_config['paths']['predictors']

    if telemetry_file_exists(mindsdb_config['storage_dir']):
        os.environ['CHECK_FOR_UPDATES'] = '0'
        print('\n x telemetry disabled! \n')
    elif os.getenv('CHECK_FOR_UPDATES', '1').lower() in [
            '0', 'false', 'False'
    ] or mindsdb_config.get('cloud', False):
        disable_telemetry(mindsdb_config['storage_dir'])
        print('\n x telemetry disabled \n')
    else:
        print('\n ✓ telemetry enabled \n')
    def run(self):
        '''
        Execute the 'learn' or 'predict' transaction described by ``self._args``.

        Runs in a subprocess because of
        ValueError: signal only works in main thread

        Open question from the original author: should this be a job for a
        celery worker instead?

        :returns: the predictions for a 'predict' transaction, otherwise None.
        '''
        import sys
        import mindsdb_native

        from mindsdb.utilities.config import Config

        name, from_data, to_predict, kwargs, config, trx_type = self._args
        config = Config(config)

        mdb = mindsdb_native.Predictor(name=name)

        if trx_type == 'learn':
            data_source = getattr(mindsdb_native,
                                  from_data['class'])(*from_data['args'],
                                                      **from_data['kwargs'])

            kwargs['use_gpu'] = config.get('use_gpu', None)
            mdb.learn(from_data=data_source, to_predict=to_predict, **kwargs)

            stats = mdb.get_model_data()['data_analysis_v2']

            # Registering with the integrations is best effort: a missing
            # config entry or a failing integration must not fail the learn
            # transaction. Only ``Exception`` is swallowed, so that
            # KeyboardInterrupt/SystemExit still propagate, and the enabled
            # flag is tested with ``if`` rather than ``assert`` because
            # asserts are stripped under ``python -O``.
            try:
                if config['integrations']['default_clickhouse'][
                        'enabled'] == True:  # noqa: E712
                    from mindsdb.interfaces.clickhouse.clickhouse import Clickhouse
                    clickhouse = Clickhouse(config)
                    clickhouse.register_predictor(name, stats)
            except Exception:
                pass

            try:
                if config['integrations']['default_mariadb'][
                        'enabled'] == True:  # noqa: E712
                    from mindsdb.interfaces.mariadb.mariadb import Mariadb
                    mariadb = Mariadb(config)
                    mariadb.register_predictor(name, stats)
            except Exception:
                pass

        if trx_type == 'predict':
            if isinstance(from_data, dict):
                when = from_data
                when_data = None
            else:
                # NOTE(review): this branch is only reached when ``from_data``
                # is not a dict, yet it is indexed with string keys below --
                # confirm which input types are expected here.
                when_data = getattr(mindsdb_native,
                                    from_data['class'])(*from_data['args'],
                                                        **from_data['kwargs'])
                when = None

            kwargs['use_gpu'] = config.get('use_gpu', None)

            predictions = mdb.predict(when=when,
                                      when_data=when_data,
                                      run_confidence_variation_analysis=True,
                                      **kwargs)

            return predictions