def run(self):
    """Train a native predictor and persist the resulting model data.

    This runs in a subprocess because training raised
    ``ValueError: signal only works in main thread`` otherwise.
    (Open question carried over from the original note: does this also
    work for a celery worker?)

    Reads ``self._args`` as the tuple
    ``(name, from_data, to_predict, kwargs, datasource_id)`` where
    ``from_data`` is a dict with ``'class'``, ``'args'`` and ``'kwargs'``
    keys describing a datasource class exposed by ``mindsdb_native``.
    """
    import logging

    import mindsdb_native
    import setproctitle

    try:
        setproctitle.setproctitle('mindsdb_native_process')
    except Exception:
        # Renaming the process is purely cosmetic; failure is ignored.
        pass

    config = Config()
    fs_store = FsSotre()
    company_id = os.environ.get('MINDSDB_COMPANY_ID', None)

    name, from_data, to_predict, kwargs, datasource_id = self._args

    mdb = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})

    # Mark the predictor record as "training" before the long-running call.
    predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
    predictor_record.datasource_id = datasource_id
    predictor_record.to_predict = to_predict
    predictor_record.version = mindsdb_native.__version__
    predictor_record.data = {
        'name': name,
        'status': 'training'
    }
    session.commit()

    to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
    data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])

    try:
        mdb.learn(
            from_data=data_source,
            to_predict=to_predict,
            **kwargs
        )
    except Exception as e:
        # The failure used to be swallowed silently. Keep the best-effort
        # flow (the model data is still fetched and stored below) but leave
        # a trace of what went wrong.
        logging.getLogger('mindsdb.main').error(f'Predictor learn error: {e}')

    fs_store.put(name, f'predictor_{company_id}_{predictor_record.id}', config['paths']['predictors'])

    model_data = mindsdb_native.F.get_model_data(name)

    # Re-query the record before storing the final model data.
    predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
    predictor_record.data = model_data
    session.commit()

    DatabaseWrapper().register_predictors([model_data])
def run_learn(name, from_data, to_predict, kwargs, datasource_id):
    """Train the native predictor ``name`` and persist the outcome.

    Args:
        name: predictor name; used to look up the ``Predictor`` record.
        from_data: dict with ``'class'``, ``'args'`` and ``'kwargs'`` keys
            describing a datasource class exposed by ``mindsdb_datasources``.
        to_predict: target column name or list of target column names.
        kwargs: extra keyword arguments forwarded to ``Predictor.learn``.
        datasource_id: id stored on the predictor record.

    On a training failure the record's data is set to status ``'error'`` and
    the function returns early. The ``'learn'`` process mark is removed on
    every exit path, including unexpected exceptions.
    """
    import mindsdb_native
    import mindsdb_datasources
    import mindsdb  # noqa: F401 -- kept from the original; not referenced directly here

    create_process_mark('learn')
    try:
        config = Config()
        fs_store = FsSotre()
        company_id = os.environ.get('MINDSDB_COMPANY_ID', None)

        mdb = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})

        # Record the training parameters and flag the predictor as "training".
        predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
        predictor_record.datasource_id = datasource_id
        predictor_record.to_predict = to_predict
        predictor_record.native_version = mindsdb_native.__version__
        predictor_record.mindsdb_version = mindsdb_version
        predictor_record.learn_args = {'to_predict': to_predict, 'kwargs': kwargs}
        predictor_record.data = {'name': name, 'status': 'training'}
        session.commit()

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
        data_source = getattr(mindsdb_datasources, from_data['class'])(*from_data['args'], **from_data['kwargs'])

        try:
            mdb.learn(from_data=data_source, to_predict=to_predict, **kwargs)
        except Exception as e:
            log = logging.getLogger('mindsdb.main')
            log.error(f'Predictor learn error: {e}')
            predictor_record.data = {'name': name, 'status': 'error'}
            session.commit()
            return

        fs_store.put(name, f'predictor_{company_id}_{predictor_record.id}', config['paths']['predictors'])

        model_data = mindsdb_native.F.get_model_data(name)

        # Re-query the record before storing the final model data.
        predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
        predictor_record.data = model_data
        session.commit()

        DatabaseWrapper().register_predictors([model_data])
    finally:
        # Previously the mark was only removed on the two explicit paths, so
        # any other exception left a stale 'learn' mark behind.
        delete_process_mark('learn')
class DataStore():
    """Create, read and delete datasources for the current company.

    Datasource metadata is kept in ``Datasource`` records (via ``session``);
    the files live under ``self.dir`` and are synchronised through
    ``self.fs_store``.
    """

    def __init__(self):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dir = self.config.paths['datasources']
        self.mindsdb_native = NativeInterface()

    def get_analysis(self, name):
        """Return the analysis of datasource ``name``.

        The analysis is computed on first access and then stored on the
        record as a JSON string.
        """
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()
        if datasource_record.analysis is None:
            datasource_record.analysis = json.dumps(
                self.mindsdb_native.analyse_dataset(
                    self.get_datasource_obj(name)))
            session.commit()
        return json.loads(datasource_record.analysis)

    def get_datasources(self, name=None):
        """Return a list of datasource dicts, optionally filtered by ``name``.

        Records whose stored ``data`` cannot be decoded are logged and
        skipped.
        """
        filters = {'company_id': self.company_id}
        if name is not None:
            filters['name'] = name
        datasource_record_arr = session.query(Datasource).filter_by(**filters)

        datasource_arr = []
        for datasource_record in datasource_record_arr:
            try:
                datasource = json.loads(datasource_record.data)
                datasource['created_at'] = datasource_record.created_at
                datasource['updated_at'] = datasource_record.updated_at
                datasource['name'] = datasource_record.name
                datasource['id'] = datasource_record.id
                datasource_arr.append(datasource)
            except Exception as e:
                log.error(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        """Return rows of datasource ``name`` as a dict.

        The result has keys ``'data'`` (list of row dicts, with null values
        replaced by ``None``), ``'rowcount'`` (``len`` of the whole
        datasource) and ``'columns_names'``. ``offset`` is only applied when
        ``limit`` is given.
        """
        offset = 0 if offset is None else offset
        ds = self.get_datasource_obj(name)

        if limit is not None:
            # @TODO Add `offset` to the `filter` method of the datasource and get rid of `offset`
            filtered_ds = ds.filter(where=where, limit=limit + offset).iloc[offset:]
        else:
            filtered_ds = ds.filter(where=where)

        filtered_ds = filtered_ds.where(pd.notnull(filtered_ds), None)
        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        """Return the single datasource dict named ``name``, or ``None``.

        Raises:
            Exception: if more than one datasource has that name.
        """
        datasource_arr = self.get_datasources(name)
        if len(datasource_arr) == 1:
            return datasource_arr[0]
        # @TODO: Remove when the db switch is more stable. This should never
        # happen, but it is a good sanity check while this is kind of buggy.
        elif len(datasource_arr) > 1:
            # Build one message string: the logger treats extra positional
            # arguments as %-format arguments, and the message has no
            # placeholders for them.
            log.error(
                f'Two or more datasource with the same name, ({len(datasource_arr)}) | Full list: {datasource_arr}')
            raise Exception('Two or more datasource with the same name')
        return None

    def delete_datasource(self, name):
        """Delete the record, the stored copy and the local directory of ``name``."""
        datasource_record = Datasource.query.filter_by(
            company_id=self.company_id, name=name).first()
        # Capture the id before deleting so nothing is read from the record
        # after it has been removed from the session.
        datasource_id = datasource_record.id
        session.delete(datasource_record)
        session.commit()
        self.fs_store.delete(
            f'datasource_{self.company_id}_{datasource_id}')
        try:
            shutil.rmtree(os.path.join(self.dir, name))
        except Exception:
            # Best effort: the local directory may already be gone.
            pass

    def _integration_creation_info(self, source_type, source):
        """Build ``(datasource class, creation_info)`` for an integration source.

        ``creation_info`` is a dict with ``'class'``, ``'args'`` and
        ``'kwargs'`` keys. For mongodb a string ``source['find']`` is decoded
        from JSON in place.

        Raises:
            KeyError: if the integration type has no known datasource class.
        """
        integration = self.config['integrations'][source_type]

        ds_class_map = {
            'clickhouse': ClickhouseDS,
            'mariadb': MariaDS,
            'mysql': MySqlDS,
            'postgres': PostgresDS,
            'mssql': MSSQLDS,
            'mongodb': MongoDS,
            'snowflake': SnowflakeDS
        }

        integration_type = integration['type']
        try:
            ds_class = ds_class_map[integration_type]
        except KeyError:
            raise KeyError(
                f"Unknown DS type: {source_type}, type is {integration['type']}"
            )

        if integration_type in ['clickhouse', 'mssql', 'postgres', 'mariadb', 'mysql']:
            ds_kwargs = {
                'query': source['query'],
                'user': integration['user'],
                'password': integration['password'],
                'host': integration['host'],
                'port': integration['port']
            }
            if integration_type != 'clickhouse':
                # The database named in the source overrides the
                # integration's one.
                if 'database' in integration:
                    ds_kwargs['database'] = integration['database']
                if 'database' in source:
                    ds_kwargs['database'] = source['database']
        elif integration_type == 'snowflake':
            ds_kwargs = {
                'query': source['query'],
                'schema': source['schema'],
                'warehouse': source['warehouse'],
                'database': source['database'],
                'host': integration['host'],
                'password': integration['password'],
                'user': integration['user'],
                'account': integration['account']
            }
        elif integration_type == 'mongodb':
            if isinstance(source['find'], str):
                source['find'] = json.loads(source['find'])
            ds_kwargs = {
                'database': source['database'],
                'collection': source['collection'],
                'query': source['find'],
                'user': integration['user'],
                'password': integration['password'],
                'host': integration['host'],
                'port': integration['port']
            }
        else:
            # Not reachable with the current class map; kept as a guard in
            # case a type is added to the map without a branch here.
            raise KeyError(
                f"Unknown DS type: {source_type}, type is {integration['type']}"
            )

        creation_info = {
            'class': ds_class.__name__,
            'args': [],
            'kwargs': ds_kwargs
        }
        return ds_class, creation_info

    def save_datasource(self, name, source_type, source, file_path=None):
        """Create datasource ``name`` and return ``(creation_info, name)``.

        Args:
            name: datasource name; also the name of its local directory.
            source_type: ``'file'``, the name of a configured integration,
                or anything else (treated as a source ``FileDS`` can open
                directly, probably a url).
            source: file name, integration query description (dict) or url.
            file_path: path of the uploaded file; required for ``'file'``.

        Raises:
            Exception: if ``file_path`` is missing for a file source, or if
                the resulting columns are not unique and non-empty. On any
                failure after the directory is created it is removed again
                and the error is re-raised.
        """
        datasource_record = Datasource(company_id=self.company_id, name=name)

        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        session.add(datasource_record)
        session.commit()
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)
                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }
            elif source_type in self.config['integrations']:
                ds_class, creation_info = self._integration_creation_info(
                    source_type, source)
                ds = ds_class(**creation_info['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                # The directory is removed by the handler below.
                raise Exception(
                    'Each column in datasource must have unique non-empty name'
                )

            datasource_record.creation_info = json.dumps(creation_info)
            datasource_record.data = json.dumps({
                'source_type': source_type,
                'source': source,
                'row_count': len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            })

            self.fs_store.put(
                name,
                f'datasource_{self.company_id}_{datasource_record.id}',
                self.dir)
        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        session.commit()
        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        """Return the datasource object for ``name``.

        With ``raw=True`` the stored ``creation_info`` dict is returned
        instead of an instantiated datasource. Any failure is logged and
        ``None`` is returned.
        """
        try:
            datasource_record = session.query(Datasource).filter_by(
                company_id=self.company_id, name=name).first()

            self.fs_store.get(
                name,
                f'datasource_{self.company_id}_{datasource_record.id}',
                self.dir)
            creation_info = json.loads(datasource_record.creation_info)
            if raw:
                return creation_info
            # SECURITY NOTE(review): `eval` runs on a class name read back
            # from the database. It is only safe while `creation_info` can
            # be written exclusively by `save_datasource`; consider an
            # explicit name -> class mapping instead.
            return eval(creation_info['class'])(*creation_info['args'],
                                                **creation_info['kwargs'])
        except Exception as e:
            log.error(f'\n{e}\n')
            return None
class CustomModels():
    """Manage user-supplied ("custom") models for the current company.

    Each model lives in ``<storage_dir>/<name>/`` as a python module
    ``<name>.py`` exposing a ``Model`` class, optionally next to a
    ``model.pickle`` file. Directories are synchronised through
    ``self.fs_store`` and metadata is kept in ``Predictor`` records with
    ``is_custom=True``.
    """

    def __init__(self):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dbw = DatabaseWrapper()
        self.storage_dir = self.config['paths']['custom_models']
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = NativeInterface()

    def _dir(self, name):
        """Return the local directory of model ``name`` as a string."""
        return str(os.path.join(self.storage_dir, name))

    def _internal_load(self, name):
        """Fetch, import and instantiate model ``name``.

        Tries ``Model.load`` on ``model.pickle`` first; if that fails for any
        reason a fresh ``Model()`` is created and initialised instead.
        """
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)
        model_dir = self._dir(name)
        # Avoid growing sys.path by one entry on every load of the same model.
        if model_dir not in sys.path:
            sys.path.insert(0, model_dir)
        module = __import__(name)

        try:
            model = module.Model.load(os.path.join(model_dir, 'model.pickle'))
        except Exception:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model
        return model

    def learn(self, name, from_data, to_predict, datasource_id, kwargs=None):
        """Fit model ``name`` on a datasource and store the trained model.

        Args:
            name: model name.
            from_data: dict with ``'class'``, ``'args'`` and ``'kwargs'``
                keys describing a ``mindsdb_datasources`` class.
            to_predict: target column name or list of target column names.
            datasource_id: accepted for interface compatibility; not used
                by this method.
            kwargs: extra options passed to ``model.fit`` (defaults to an
                empty dict).
        """
        if kwargs is None:
            kwargs = {}

        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]

        data_source = getattr(mindsdb_datasources, from_data['class'])(
            *from_data['args'], **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(
            data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis_v2'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        model_data['columns'] = list(data_analysis.keys())
        self.save_model_data(name, model_data)
        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)])

    def predict(self, name, when_data=None, from_data=None, kwargs=None):
        """Run model ``name`` and return one dict per predicted row.

        Args:
            name: model name.
            when_data: a dict of single values (one row) or anything
                ``pandas.DataFrame`` accepts. Used when ``from_data`` is
                ``None``.
            from_data: a datasource description dict (``'class'``,
                ``'args'``, ``'kwargs'``) or an object exposing ``.df``.
            kwargs: extra options passed to ``model.predict``.

        Returns:
            A list with one entry per prediction row, each of the form
            ``{column: {'predicted_value': value}}``.

        Raises:
            ValueError: if neither ``when_data`` nor ``from_data`` is given.
        """
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        if kwargs is None:
            kwargs = {}

        if from_data is not None:
            if isinstance(from_data, dict):
                data_source = getattr(mindsdb_datasources, from_data['class'])(
                    *from_data['args'], **from_data['kwargs'])
            # assume that particular instance of any DataSource class is provided
            else:
                data_source = from_data
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                # Wrap each value in a list without touching the caller's dict.
                data_frame = pd.DataFrame(
                    {key: [value] for key, value in when_data.items()})
            else:
                data_frame = pd.DataFrame(when_data)
        else:
            # Previously this fell through to an unbound `data_frame`.
            raise ValueError(
                'Either `when_data` or `from_data` must be provided')

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        columns = list(predictions.columns)
        return [
            {col: {'predicted_value': predictions[col].iloc[i]} for col in columns}
            for i in range(len(predictions))
        ]

    def get_model_data(self, name):
        """Return the ``data`` stored on the custom predictor record ``name``."""
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        return predictor_record.data

    def save_model_data(self, name, data):
        """Store ``data`` on the custom predictor record, creating it if missing."""
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        if predictor_record is None:
            predictor_record = Predictor(company_id=self.company_id,
                                         name=name,
                                         is_custom=True,
                                         data=data)
            session.add(predictor_record)
        else:
            predictor_record.data = data
        session.commit()

    def get_models(self):
        """Return the stored data of every custom model of this company."""
        predictor_names = [
            x.name
            for x in Predictor.query.filter_by(company_id=self.company_id,
                                               is_custom=True)
        ]
        return [self.get_model_data(name) for name in predictor_names]

    def delete_model(self, name):
        """Remove the record, local directory, registration and stored copy of ``name``."""
        Predictor.query.filter_by(company_id=self.company_id,
                                  name=name,
                                  is_custom=True).delete()
        session.commit()
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def rename_model(self, name, new_name):
        """Rename model ``name`` to ``new_name`` locally, in the db and in the store."""
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        # The module file sits inside the moved directory. The old code
        # concatenated the directory and the file name without a separator.
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))

        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        predictor_record.name = new_name
        session.commit()

        self.dbw.register_predictors([self.get_model_data(new_name)])

        # The local directory is now called `new_name`. The first argument of
        # `put` is presumably the local directory name, as in every other
        # `put` call in this class -- TODO confirm against the store.
        self.fs_store.put(new_name,
                          f'custom_model_{self.company_id}_{new_name}',
                          self.storage_dir)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def export_model(self, name):
        """Zip the directory of model ``name`` and return the archive path.

        The archive is written next to the model directory, at
        ``<storage_dir>/<name>.zip``, which is the path returned. The old
        code wrote it to the current working directory instead.
        """
        shutil.make_archive(base_name=self._dir(name),
                            format='zip',
                            root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        """Import a zipped model from ``fpath`` under the name ``name``.

        The archive must contain ``model.py``, which is renamed to
        ``<name>.py``. The model is registered with the database wrapper
        only when ``trained_status == 'trained'``.
        """
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(
            model.to_predict, list) else [model.to_predict]
        self.save_model_data(
            name, {
                'name': name,
                'data_analysis_v2': model.column_type_map,
                'predict': model.to_predict,
                'status': trained_status,
                'is_custom': True,
                'columns': list(model.column_type_map.keys())
            })

        # Create an empty __init__.py inside the model directory.
        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}',
                          self.storage_dir)

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)])