Example #1
0
    def run(self):
        '''
        running at subprocess due to
        ValueError: signal only works in main thread

        this is work for celery worker here?
        '''
        import mindsdb_native
        import setproctitle

        try:
            setproctitle.setproctitle('mindsdb_native_process')
        except Exception:
            pass

        config = Config()
        fs_store = FsSotre()
        company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        name, from_data, to_predict, kwargs, datasource_id = self._args

        mdb = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})

        predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
        predictor_record.datasource_id = datasource_id
        predictor_record.to_predict = to_predict
        predictor_record.version = mindsdb_native.__version__
        predictor_record.data = {
            'name': name,
            'status': 'training'
        }
        #predictor_record.datasource_id = ... <-- can be done once `learn` is passed a datasource name
        session.commit()

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
        data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])

        try:
            mdb.learn(
                from_data=data_source,
                to_predict=to_predict,
                **kwargs
            )
        except Exception:
            pass

        fs_store.put(name, f'predictor_{company_id}_{predictor_record.id}', config['paths']['predictors'])

        model_data = mindsdb_native.F.get_model_data(name)

        predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
        predictor_record.data = model_data
        session.commit()

        DatabaseWrapper().register_predictors([model_data])
Example #2
0
def run_learn(name, from_data, to_predict, kwargs, datasource_id):
    import mindsdb_native
    import mindsdb_datasources
    import mindsdb

    create_process_mark('learn')

    config = Config()
    fs_store = FsSotre()

    company_id = os.environ.get('MINDSDB_COMPANY_ID', None)

    mdb = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})

    predictor_record = Predictor.query.filter_by(company_id=company_id,
                                                 name=name).first()
    predictor_record.datasource_id = datasource_id
    predictor_record.to_predict = to_predict
    predictor_record.native_version = mindsdb_native.__version__
    predictor_record.mindsdb_version = mindsdb_version
    predictor_record.learn_args = {'to_predict': to_predict, 'kwargs': kwargs}
    predictor_record.data = {'name': name, 'status': 'training'}
    session.commit()

    to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
    data_source = getattr(mindsdb_datasources,
                          from_data['class'])(*from_data['args'],
                                              **from_data['kwargs'])

    try:
        mdb.learn(from_data=data_source, to_predict=to_predict, **kwargs)
    except Exception as e:
        log = logging.getLogger('mindsdb.main')
        log.error(f'Predictor learn error: {e}')
        predictor_record.data = {'name': name, 'status': 'error'}
        session.commit()
        delete_process_mark('learn')
        return

    fs_store.put(name, f'predictor_{company_id}_{predictor_record.id}',
                 config['paths']['predictors'])

    model_data = mindsdb_native.F.get_model_data(name)

    predictor_record = Predictor.query.filter_by(company_id=company_id,
                                                 name=name).first()
    predictor_record.data = model_data
    session.commit()

    DatabaseWrapper().register_predictors([model_data])
    delete_process_mark('learn')
Example #3
0
class DataStore():
    def __init__(self):
        self.config = Config()

        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dir = self.config.paths['datasources']
        self.mindsdb_native = NativeInterface()

    def get_analysis(self, name):
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()
        if datasource_record.analysis is None:
            datasource_record.analysis = json.dumps(
                self.mindsdb_native.analyse_dataset(
                    self.get_datasource_obj(name)))
            session.commit()

        analysis = json.loads(datasource_record.analysis)
        return analysis

    def get_datasources(self, name=None):
        datasource_arr = []
        if name is not None:
            datasource_record_arr = session.query(Datasource).filter_by(
                company_id=self.company_id, name=name)
        else:
            datasource_record_arr = session.query(Datasource).filter_by(
                company_id=self.company_id)
        for datasource_record in datasource_record_arr:
            try:
                datasource = json.loads(datasource_record.data)
                datasource['created_at'] = datasource_record.created_at
                datasource['updated_at'] = datasource_record.updated_at
                datasource['name'] = datasource_record.name
                datasource['id'] = datasource_record.id
                datasource_arr.append(datasource)
            except Exception as e:
                log.error(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        offset = 0 if offset is None else offset
        ds = self.get_datasource_obj(name)

        if limit is not None:
            # @TODO Add `offset` to the `filter` method of the datasource and get rid of `offset`
            filtered_ds = ds.filter(where=where,
                                    limit=limit + offset).iloc[offset:]
        else:
            filtered_ds = ds.filter(where=where)

        filtered_ds = filtered_ds.where(pd.notnull(filtered_ds), None)
        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        datasource_arr = self.get_datasources(name)
        if len(datasource_arr) == 1:
            return datasource_arr[0]
        # @TODO: Remove when db swithc is more stable, this should never happen, but good santiy check while this is kinda buggy
        elif len(datasource_arr) > 1:
            log.error('Two or more datasource with the same name, (',
                      len(datasource_arr), ') | Full list: ', datasource_arr)
            raise Exception('Two or more datasource with the same name')
        return None

    def delete_datasource(self, name):
        datasource_record = Datasource.query.filter_by(
            company_id=self.company_id, name=name).first()
        id = datasource_record.id
        session.delete(datasource_record)
        session.commit()
        self.fs_store.delete(
            f'datasource_{self.company_id}_{datasource_record.id}')
        try:
            shutil.rmtree(os.path.join(self.dir, name))
        except Exception:
            pass

    def save_datasource(self, name, source_type, source, file_path=None):
        datasource_record = Datasource(company_id=self.company_id, name=name)

        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        session.add(datasource_record)
        session.commit()
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)

                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }

            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(
                        f"Unknown DS type: {source_type}, type is {integration['type']}"
                    )

                if integration['type'] in ['clickhouse']:
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] in [
                        'mssql', 'postgres', 'mariadb', 'mysql'
                ]:
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    if 'database' in integration:
                        creation_info['kwargs']['database'] = integration[
                            'database']

                    if 'database' in source:
                        creation_info['kwargs']['database'] = source[
                            'database']

                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] == 'snowflake':
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }

                    ds = dsClass(**creation_info['kwargs'])

                elif integration['type'] == 'mongodb':
                    if isinstance(source['find'], str):
                        source['find'] = json.loads(source['find'])
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    ds = dsClass(**creation_info['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception(
                    'Each column in datasource must have unique non-empty name'
                )

            datasource_record.creation_info = json.dumps(creation_info)
            datasource_record.data = json.dumps({
                'source_type':
                source_type,
                'source':
                source,
                'row_count':
                len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            })

            self.fs_store.put(
                name, f'datasource_{self.company_id}_{datasource_record.id}',
                self.dir)

        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        session.commit()
        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        try:
            datasource_record = session.query(Datasource).filter_by(
                company_id=self.company_id, name=name).first()
            self.fs_store.get(
                name, f'datasource_{self.company_id}_{datasource_record.id}',
                self.dir)
            creation_info = json.loads(datasource_record.creation_info)
            if raw:
                return creation_info
            else:
                return eval(creation_info['class'])(*creation_info['args'],
                                                    **creation_info['kwargs'])
        except Exception as e:
            log.error(f'\n{e}\n')
            return None
Example #4
0
class CustomModels():
    def __init__(self):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dbw = DatabaseWrapper()
        self.storage_dir = self.config['paths']['custom_models']
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = NativeInterface()
        self.dbw = DatabaseWrapper()

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, name))

    def _internal_load(self, name):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)
        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(
                os.path.join(self._dir(name), 'model.pickle'))
        except Exception as e:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model

        return model

    def learn(self, name, from_data, to_predict, datasource_id, kwargs={}):
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict,
                                              list) else [to_predict]

        data_source = getattr(mindsdb_datasources,
                              from_data['class'])(*from_data['args'],
                                                  **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(
            data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis_v2'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        model_data['columns'] = list(data_analysis.keys())
        self.save_model_data(name, model_data)
        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)])

    def predict(self, name, when_data=None, from_data=None, kwargs=None):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)
        if kwargs is None:
            kwargs = {}
        if from_data is not None:
            if isinstance(from_data, dict):
                data_source = getattr(mindsdb_datasources, from_data['class'])(
                    *from_data['args'], **from_data['kwargs'])
            # assume that particular instance of any DataSource class is provided
            else:
                data_source = from_data
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                for k in when_data:
                    when_data[k] = [when_data[k]]
                data_frame = pd.DataFrame(when_data)
            else:
                data_frame = pd.DataFrame(when_data)

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            pred_arr.append({})
            pred_arr[-1] = {}
            for col in predictions.columns:
                pred_arr[-1][col] = {}
                pred_arr[-1][col]['predicted_value'] = predictions[col].iloc[i]

        return pred_arr

    def get_model_data(self, name):
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        return predictor_record.data

    def save_model_data(self, name, data):
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        if predictor_record is None:
            predictor_record = Predictor(company_id=self.company_id,
                                         name=name,
                                         is_custom=True,
                                         data=data)
            session.add(predictor_record)
        else:
            predictor_record.data = data
        session.commit()

    def get_models(self):
        predictor_names = [
            x.name
            for x in Predictor.query.filter_by(company_id=self.company_id,
                                               is_custom=True)
        ]
        models = []
        for name in predictor_names:
            models.append(self.get_model_data(name))

        return models

    def delete_model(self, name):
        Predictor.query.filter_by(company_id=self.company_id,
                                  name=name,
                                  is_custom=True).delete()
        session.commit()
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def rename_model(self, name, new_name):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        shutil.move(os.path.join(self._dir(new_name) + f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))

        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        predictor_record.name = new_name
        session.commit()

        self.dbw.register_predictors([self.get_model_data(new_name)])

        self.fs_store.put(name, f'custom_model_{self.company_id}_{new_name}',
                          self.storage_dir)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def export_model(self, name):
        shutil.make_archive(base_name=name,
                            format='zip',
                            root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(
            model.to_predict, list) else [model.to_predict]
        self.save_model_data(
            name, {
                'name': name,
                'data_analysis_v2': model.column_type_map,
                'predict': model.to_predict,
                'status': trained_status,
                'is_custom': True,
                'columns': list(model.column_type_map.keys())
            })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)])