def store_stream(self, stream):
    """Stores a created stream."""
    stream_name = f"{self.name}_{stream.predictor}"
    stream_rec = Stream(name=stream_name,
                        connection_params=self.connection_params,
                        advanced_params=self.advanced_info,
                        _type=stream._type,
                        predictor=stream.predictor,
                        integration=self.name,
                        company_id=self.company_id,
                        stream_in=stream.stream_in_name,
                        stream_out=stream.stream_out_name)
    session.add(stream_rec)
    session.commit()
    self.streams[stream_name] = stream.stop_event
def _try_outdate_db_status(self, predictor_record):
    from mindsdb_native import __version__ as native_version
    from mindsdb import __version__ as mindsdb_version
    from mindsdb.interfaces.storage.db import session

    if predictor_record.update_status == 'update_failed':
        return predictor_record

    if predictor_record.native_version != native_version:
        predictor_record.update_status = 'available'
    if predictor_record.mindsdb_version != mindsdb_version:
        predictor_record.update_status = 'available'

    session.commit()
    return predictor_record
def emit(self, record):
    log_type = record.levelname
    source = f'file: {record.pathname} - line: {record.lineno}'
    payload = record.msg

    if telemtry_enabled:
        pass
        # @TODO: Enable once we are sure no sensitive info is being output in the logs
        # if log_type in ['INFO']:
        #     add_breadcrumb(
        #         category='auth',
        #         message=str(payload),
        #         level='info',
        #     )
        # Might be too much traffic if we send this for users with slow networks
        # if log_type in ['DEBUG']:
        #     add_breadcrumb(
        #         category='auth',
        #         message=str(payload),
        #         level='debug',
        #     )

    if log_type in ['ERROR', 'WARNING']:
        trace = str(traceback.format_stack(limit=20))
        trac_log = Log(log_type='traceback', source=source, payload=trace, company_id=self.company_id)
        session.add(trac_log)
        session.commit()

        if telemtry_enabled:
            add_breadcrumb(
                category='stack_trace',
                message=trace,
                level='info',
            )
            if log_type in ['ERROR']:
                capture_message(str(payload))
            if log_type in ['WARNING']:
                capture_message(str(payload))

    log = Log(log_type=str(log_type), source=source, payload=str(payload), company_id=self.company_id)
    session.add(log)
    session.commit()
def _lock_predictor(self, id, mode='write'):
    from mindsdb.interfaces.storage.db import session, Semaphor

    while True:
        semaphor_record = session.query(Semaphor).filter_by(
            company_id=self.company_id, entity_id=id, entity_type='predictor').first()
        if semaphor_record is not None:
            if mode == 'read' and semaphor_record.action == 'read':
                return True
        try:
            semaphor_record = Semaphor(company_id=self.company_id, entity_id=id,
                                       entity_type='predictor', action=mode)
            session.add(semaphor_record)
            session.commit()
            return True
        except Exception:
            pass
        time.sleep(1)
def store_stream(self, stream):
    """Stores a created stream."""
    stream_name = f"{self.name}_{stream.predictor}"
    stream_rec = Stream(name=stream_name,
                        host=stream.host,
                        port=stream.port,
                        db=stream.db,
                        _type=stream._type,
                        predictor=stream.predictor,
                        integration=self.name,
                        company_id=self.company_id,
                        stream_in=stream.stream_in_name,
                        stream_out=stream.stream_out_name)
    session.add(stream_rec)
    session.commit()
    self.streams[stream_name] = stream.stop_event
def run(self):
    '''
    Running in a subprocess due to:
        ValueError: signal only works in main thread

    Could this work as a celery worker?
    '''
    import mindsdb_native

    config = Config()
    fs_store = FsSotre()
    company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
    name, from_data, to_predict, kwargs, datasource_id = self._args

    mdb = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})

    predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
    predictor_record.datasource_id = datasource_id
    predictor_record.to_predict = to_predict
    predictor_record.version = mindsdb_native.__version__
    predictor_record.data = {'name': name, 'status': 'training'}
    # predictor_record.datasource_id = ...  <-- can be done once `learn` is passed a datasource name
    session.commit()

    to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
    data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])
    try:
        mdb.learn(from_data=data_source, to_predict=to_predict, **kwargs)
    except Exception:
        pass  # learn() errors are intentionally swallowed here

    fs_store.put(name, f'predictor_{company_id}_{predictor_record.id}', config['paths']['predictors'])

    model_data = mindsdb_native.F.get_model_data(name)

    predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
    predictor_record.data = model_data
    session.commit()

    DatabaseWrapper().register_predictors([model_data])
def put(self, name):
    params = request.json.get('params')
    if not isinstance(params, dict):
        abort(400, "type of 'params' must be dict")

    for param in ["predictor", "stream_in", "stream_out", "integration_name"]:
        if param not in params:
            abort(400, f"'{param}' is missing.")

    integration_name = params['integration_name']
    integration_info = get_integration(integration_name)
    if not integration_info:
        abort(400, f"integration '{integration_name}' doesn't exist.")

    if integration_info["type"] not in ['redis', 'kafka']:
        abort(400, f"only integrations of the 'redis' or 'kafka' type can be used to create streams. got: '{integration_info['type']}' type")

    connection_params = params.get('connect', {})
    advanced_params = params.get('advanced', {})
    predictor = params['predictor']
    stream_in = params['stream_in']
    stream_out = params['stream_out']
    _type = params.get('type', 'forecast')

    if _type.lower() == StreamTypes.timeseries:
        ts_params = params.get('ts_params')
    else:
        ts_params = {}

    if predictor not in get_predictors():
        abort(400, f"requested predictor '{predictor}' is not ready or doesn't exist")

    stream = StreamDB(_type=_type,
                      name=name,
                      connection_params=connection_params,
                      advanced_params=advanced_params,
                      predictor=predictor,
                      stream_in=stream_in,
                      stream_out=stream_out,
                      integration=integration_name,
                      company_id=COMPANY_ID,
                      ts_params=ts_params)

    session.add(stream)
    session.commit()
    return {"status": "success", "stream_name": name}, 200
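# A minimal sketch of how the stream PUT endpoint above might be called, assuming
# the resource is mounted at /api/streams/<name> (the exact route prefix is not
# shown in this code) and an integration named 'my_redis' already exists. The
# payload mirrors the keys the handler validates.
import requests

resp = requests.put(
    'http://127.0.0.1:47334/api/streams/my_stream',  # hypothetical host and route
    json={'params': {
        'predictor': 'my_predictor',
        'stream_in': 'stream_in_name',
        'stream_out': 'stream_out_name',
        'integration_name': 'my_redis',
        'connect': {},       # optional extra connection params
        'advanced': {},      # optional advanced params
        'type': 'forecast',  # 'timeseries' additionally reads 'ts_params'
    }},
)
print(resp.status_code, resp.json())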
def run_learn_remote(df: DataFrame, predictor_id: int) -> None:
    predictor_record = Predictor.query.with_for_update().get(predictor_id)
    resp = None
    try:
        serialized_df = json.dumps(df.to_dict())
        resp = requests.post(predictor_record.data['train_url'],
                             json={'df': serialized_df, 'target': predictor_record.to_predict[0]})
        assert resp.status_code == 200
        predictor_record.data['status'] = 'complete'
    except Exception as e:
        predictor_record.data['status'] = 'error'
        # prefer the remote response body when available, fall back to the local exception
        predictor_record.data['error'] = str(resp.text) if resp is not None else str(e)

    session.commit()
def put(self, name):
    params = request.json.get('params')
    if not isinstance(params, dict):
        abort(400, "type of 'params' must be dict")

    for param in ["host", "port", "predictor", "stream_in", "stream_out", "integration_name"]:
        if param not in params:
            abort(400, f"'{param}' is missing.")

    integration_name = params['integration_name']
    integration_info = get_integration(integration_name)
    if not integration_info:
        abort(400, f"integration '{integration_name}' doesn't exist.")
    if integration_info['type'] != 'redis':
        abort(400, f"only integrations of the 'redis' type can be used to create redis streams. got: '{integration_info['type']}' type")

    host = integration_info['host']
    port = integration_info['port']
    db = integration_info.get('db', 0)

    predictor = params['predictor']
    stream_in = params['stream_in']
    stream_out = params['stream_out']
    _type = params.get('type', 'forecast')

    if predictor not in get_predictors():
        abort(400, f"requested predictor '{predictor}' is not ready or doesn't exist")

    stream = StreamDB(_type=_type,
                      name=name,
                      host=host,
                      port=port,
                      db=db,
                      predictor=predictor,
                      stream_in=stream_in,
                      stream_out=stream_out,
                      integration=integration_name,
                      company_id=COMPANY_ID)

    session.add(stream)
    session.commit()
    return {"status": "success", "stream_name": name}, 200
def save_file(self, name, file_path, file_name=None, company_id=None):
    """ Save the file to our store

        Args:
            name (str): name under which the file will be available in the SQL API
            file_name (str): file name
            file_path (str): path to the file
            company_id (int): company id

        Returns:
            int: id of 'file' record in db
    """
    if file_name is None:
        file_name = Path(file_path).name
    try:
        ds_meta_dir = Path(self.dir).joinpath(f'{company_id}@@@@@{name}')
        ds_meta_dir.mkdir()

        source = ds_meta_dir.joinpath(file_name)
        shutil.move(file_path, str(source))

        ds = FileDS(str(source))
        ds_meta = self._get_ds_meta(ds)

        column_names = ds_meta['column_names']
        if ds_meta['column_names'] is not None:
            column_names = json.dumps([dict(name=x) for x in ds_meta['column_names']])

        file_record = File(name=name,
                           company_id=company_id,
                           source_file_path=file_name,
                           file_path=str(source),
                           row_count=ds_meta['row_count'],
                           columns=column_names)
        session.add(file_record)
        session.commit()
        self.fs_store.put(f'{company_id}@@@@@{name}', f'file_{company_id}_{file_record.id}', self.dir)
    except Exception as e:
        log.error(e)
        shutil.rmtree(ds_meta_dir)
        raise

    return file_record.id
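# A usage sketch for save_file above, assuming `file_controller` is an instance of
# the surrounding class and '/tmp/data.csv' exists. Note the file is *moved* into
# the store, so the original path is consumed on success.
file_id = file_controller.save_file(
    name='my_table',            # name under which the file appears in the SQL API
    file_path='/tmp/data.csv',
    company_id=1,
)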
def update_model(self, name):
    from mindsdb_native import F
    from mindsdb_worker.updater.update_model import update_model
    from mindsdb.interfaces.storage.db import session, Predictor
    from mindsdb.interfaces.datastore.datastore import DataStore

    try:
        predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
        predictor_record.update_status = 'updating'
        session.commit()

        update_model(name,
                     self.delete_model,
                     F.delete_model,
                     self.learn,
                     self._lock_context,
                     self.company_id,
                     self.config['paths']['predictors'],
                     predictor_record,
                     self.fs_store,
                     DataStore())

        predictor_record = self._update_db_status(predictor_record)
    except Exception as e:
        log.error(e)
        predictor_record.update_status = 'update_failed'
        session.commit()
        return str(e)
def add(self, name, query, integration_name, company_id=None):
    integration_records = session.query(Integration).filter_by(company_id=company_id).all()
    integration_id = None
    for record in integration_records:
        if record.name == integration_name:
            integration_id = record.id
            break
    else:
        raise Exception(f"Can't find integration with name: {integration_name}")

    view_record = View(name=name, company_id=company_id, query=query, integration_id=integration_id)
    session.add(view_record)
    session.commit()
def get_model_data(self, name, db_fix=True):
    from mindsdb_native import F
    from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
    from mindsdb.interfaces.storage.db import session, Predictor

    predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=False).first()
    predictor_record = self._try_outdate_db_status(predictor_record)
    model = predictor_record.data
    if model is None or model['status'] == 'training':
        try:
            self.fs_store.get(name, f'predictor_{self.company_id}_{predictor_record.id}',
                              self.config['paths']['predictors'])
            new_model_data = F.get_model_data(name)
        except Exception:
            new_model_data = None

        if predictor_record.data is None or (new_model_data is not None and len(new_model_data) > len(predictor_record.data)):
            predictor_record.data = new_model_data
            model = new_model_data
            session.commit()

    # Make some corrections so databases don't break when dealing with empty columns
    if db_fix:
        data_analysis = model['data_analysis_v2']
        for column in model['columns']:
            analysis = data_analysis.get(column)
            if isinstance(analysis, dict) and (len(analysis) == 0 or analysis.get('empty', {}).get('is_empty', False)):
                data_analysis[column]['typing'] = {'data_subtype': DATA_SUBTYPES.INT}

    model['created_at'] = str(parse_datetime(str(predictor_record.created_at).split('.')[0]))
    model['updated_at'] = str(parse_datetime(str(predictor_record.updated_at).split('.')[0]))
    model['predict'] = predictor_record.to_predict
    model['update'] = predictor_record.update_status
    return self._pack(model)
def _setup_for_creation(self, name):
    from mindsdb_datasources import FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS, SnowflakeDS, AthenaDS
    import mindsdb_native
    from mindsdb_native import F
    from mindsdb_native.libs.constants.mindsdb import DATA_SUBTYPES
    from mindsdb.interfaces.storage.db import session, Predictor

    if name in self.predictor_cache:
        del self.predictor_cache[name]

    # Here for no particular reason, because we want to run this sometimes but not too often
    self._invalidate_cached_predictors()

    predictor_dir = Path(self.config.paths['predictors']).joinpath(name)
    create_directory(predictor_dir)

    predictor_record = Predictor(company_id=self.company_id, name=name, is_custom=False)
    session.add(predictor_record)
    session.commit()
def rename_model(self, name, new_name):
    self.fs_store.get(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)

    self.dbw.unregister_predictor(name)
    shutil.move(self._dir(name), self._dir(new_name))
    shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                os.path.join(self._dir(new_name), f'{new_name}.py'))

    predictor_record = Predictor.query.filter_by(company_id=self.company_id, name=name, is_custom=True).first()
    predictor_record.name = new_name
    session.commit()

    self.dbw.register_predictors([self.get_model_data(new_name)])

    self.fs_store.put(new_name, f'custom_model_{self.company_id}_{new_name}', self.storage_dir)
    self.fs_store.delete(f'custom_model_{self.company_id}_{name}')
def run_learn(df: DataFrame, problem_definition: ProblemDefinition, predictor_id: int,
              delete_ds_on_fail: Optional[bool] = False) -> None:
    try:
        run_generate(df, problem_definition, predictor_id)
        run_fit(predictor_id, df)
    except Exception as e:
        predictor_record = Predictor.query.with_for_update().get(predictor_id)
        if delete_ds_on_fail is True:
            linked_db_ds = Datasource.query.filter_by(id=predictor_record.datasource_id).first()
            if linked_db_ds is not None:
                predictors_with_ds = Predictor.query.filter(
                    (Predictor.id != predictor_id) & (Predictor.datasource_id == linked_db_ds.id)
                ).all()
                if len(predictors_with_ds) == 0:
                    session.delete(linked_db_ds)
                    predictor_record.datasource_id = None
        predictor_record.data = {"error": str(e)}
        session.commit()
def delete_datasource(self, name, company_id=None):
    datasource_record = Datasource.query.filter_by(company_id=company_id, name=name).first()
    if not Config()["force_datasource_removing"]:
        linked_models = Predictor.query.filter_by(company_id=company_id, datasource_id=datasource_record.id).all()
        if linked_models:
            raise Exception(
                "Can't delete {} datasource because the following models are linked to it: {}"
                .format(name, [model.name for model in linked_models])
            )

    session.query(Semaphor).filter_by(
        company_id=company_id, entity_id=datasource_record.id, entity_type='datasource'
    ).delete()
    session.delete(datasource_record)
    session.commit()
    self.fs_store.delete(f'datasource_{company_id}_{datasource_record.id}')

    try:
        shutil.rmtree(os.path.join(self.dir, f'{company_id}@@@@@{name}'))
    except Exception:
        pass
def run_fit(predictor_id: int, df: pd.DataFrame) -> None:
    try:
        predictor_record = session.query(db.Predictor).filter_by(id=predictor_id).first()
        assert predictor_record is not None

        fs_store = FsStore()
        config = Config()

        predictor_record.data = {'training_log': 'training'}
        session.commit()
        predictor: lightwood.PredictorInterface = lightwood.predictor_from_code(predictor_record.code)
        predictor.learn(df)

        session.refresh(predictor_record)

        fs_name = f'predictor_{predictor_record.company_id}_{predictor_record.id}'
        pickle_path = os.path.join(config['paths']['predictors'], fs_name)
        predictor.save(pickle_path)

        fs_store.put(fs_name, fs_name, config['paths']['predictors'])

        predictor_record.data = predictor.model_analysis.to_dict()
        predictor_record.dtype_dict = predictor.dtype_dict
        session.commit()

        dbw = DatabaseWrapper(predictor_record.company_id)
        mi = ModelInterfaceWrapper(ModelInterface(), predictor_record.company_id)
        dbw.register_predictors([mi.get_model_data(predictor_record.name)])
    except Exception as e:
        session.refresh(predictor_record)
        predictor_record.data = {'error': f'{traceback.format_exc()}\nMain error: {e}'}
        session.commit()
        raise e
def start_analysis(self, name):
    datasource_record = session.query(Datasource).filter_by(company_id=self.company_id, name=name).first()
    if datasource_record.analysis is not None:
        return None

    semaphor_record = session.query(Semaphor).filter_by(
        company_id=self.company_id, entity_id=datasource_record.id, entity_type='datasource').first()
    if semaphor_record is None:
        semaphor_record = Semaphor(company_id=self.company_id, entity_id=datasource_record.id,
                                   entity_type='datasource', action='write')
        session.add(semaphor_record)
        session.commit()
    else:
        return

    try:
        analysis = self.mindsdb_native.analyse_dataset(self.get_datasource_obj(name, raw=True))
        datasource_record = session.query(Datasource).filter_by(company_id=self.company_id, name=name).first()
        datasource_record.analysis = json.dumps(analysis)
        session.commit()
    except Exception as e:
        log.error(e)
    finally:
        semaphor_record = session.query(Semaphor).filter_by(
            company_id=self.company_id, entity_id=datasource_record.id, entity_type='datasource').first()
        session.delete(semaphor_record)
        session.commit()
def get_analysis(self, name):
    datasource_record = session.query(Datasource).filter_by(company_id=self.company_id, name=name).first()
    if datasource_record.analysis is None:
        try:
            original_process_title = setproctitle.getproctitle()
            setproctitle.setproctitle('mindsdb_native_process')
        except Exception:
            pass

        analysis = self.mindsdb_native.analyse_dataset(self.get_datasource_obj(name))
        datasource_record.analysis = json.dumps(analysis)
        session.commit()

        try:
            setproctitle.setproctitle(original_process_title)
        except Exception:
            pass

    analysis = json.loads(datasource_record.analysis)
    return analysis
def run_fit(predictor_id: int, df: pd.DataFrame) -> None:
    try:
        predictor_record = Predictor.query.with_for_update().get(predictor_id)
        assert predictor_record is not None

        fs_store = FsStore()
        config = Config()

        predictor_record.data = {'training_log': 'training'}
        session.commit()
        predictor: lightwood.PredictorInterface = lightwood.predictor_from_code(predictor_record.code)
        predictor.learn(df)

        session.refresh(predictor_record)

        fs_name = f'predictor_{predictor_record.company_id}_{predictor_record.id}'
        pickle_path = os.path.join(config['paths']['predictors'], fs_name)
        predictor.save(pickle_path)

        fs_store.put(fs_name, fs_name, config['paths']['predictors'])

        predictor_record.data = predictor.model_analysis.to_dict()

        # Get the training time for each mixer that was tried; this is only
        # available after training has finished.
        fit_mixers = list(predictor.runtime_log[x] for x in predictor.runtime_log
                          if isinstance(x, tuple) and x[0] == "fit_mixer")
        submodel_data = predictor_record.data.get("submodel_data", [])
        # Add the training time to the rest of the mixers' info
        if submodel_data and fit_mixers and len(submodel_data) == len(fit_mixers):
            for i, tr_time in enumerate(fit_mixers):
                submodel_data[i]["training_time"] = tr_time
        predictor_record.data["submodel_data"] = submodel_data

        predictor_record.dtype_dict = predictor.dtype_dict
        session.commit()

        dbw = DatabaseWrapper(predictor_record.company_id)
        mi = WithKWArgsWrapper(ModelInterface(), company_id=predictor_record.company_id)
    except Exception as e:
        session.refresh(predictor_record)
        predictor_record.data = {'error': f'{traceback.format_exc()}\nMain error: {e}'}
        session.commit()
        raise e

    try:
        dbw.register_predictors([mi.get_model_data(predictor_record.name)])
    except Exception as e:
        log.warning(e)
def run_learn(name, from_data, to_predict, kwargs, datasource_id):
    import mindsdb_native
    import mindsdb_datasources
    import mindsdb

    create_process_mark('learn')

    config = Config()
    fs_store = FsSotre()
    company_id = os.environ.get('MINDSDB_COMPANY_ID', None)

    mdb = mindsdb_native.Predictor(name=name, run_env={'trigger': 'mindsdb'})

    predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
    predictor_record.datasource_id = datasource_id
    predictor_record.to_predict = to_predict
    predictor_record.native_version = mindsdb_native.__version__
    predictor_record.mindsdb_version = mindsdb_version
    predictor_record.learn_args = {'to_predict': to_predict, 'kwargs': kwargs}
    predictor_record.data = {'name': name, 'status': 'training'}
    session.commit()

    to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
    data_source = getattr(mindsdb_datasources, from_data['class'])(*from_data['args'], **from_data['kwargs'])

    try:
        mdb.learn(from_data=data_source, to_predict=to_predict, **kwargs)
    except Exception as e:
        log = logging.getLogger('mindsdb.main')
        log.error(f'Predictor learn error: {e}')
        predictor_record.data = {'name': name, 'status': 'error'}
        session.commit()
        delete_process_mark('learn')
        return

    fs_store.put(name, f'predictor_{company_id}_{predictor_record.id}', config['paths']['predictors'])

    model_data = mindsdb_native.F.get_model_data(name)

    predictor_record = Predictor.query.filter_by(company_id=company_id, name=name).first()
    predictor_record.data = model_data
    session.commit()

    DatabaseWrapper().register_predictors([model_data])
    delete_process_mark('learn')
def start_analysis(self, name, company_id=None):
    dataset_record = session.query(Dataset).filter_by(company_id=company_id, name=name).first()
    if dataset_record.analysis_id is not None:
        return None

    semaphor_record = session.query(Semaphor).filter_by(
        company_id=company_id, entity_id=dataset_record.id, entity_type='dataset').first()
    if semaphor_record is None:
        semaphor_record = Semaphor(company_id=company_id, entity_id=dataset_record.id,
                                   entity_type='dataset', action='write')
        session.add(semaphor_record)
        session.commit()
    else:
        return

    try:
        analysis = self.model_interface.analyse_dataset(
            ds=self.get_datasource_obj(name, raw=True, company_id=company_id),
            company_id=company_id
        )
        dataset_record = session.query(Dataset).filter_by(company_id=company_id, name=name).first()
        analysis_record = Analysis(analysis=json.dumps(analysis, cls=CustomJSONEncoder))
        session.add(analysis_record)
        session.flush()
        dataset_record.analysis_id = analysis_record.id
        session.commit()
    except Exception as e:
        log.error(e)
    finally:
        semaphor_record = session.query(Semaphor).filter_by(
            company_id=company_id, entity_id=dataset_record.id, entity_type='dataset').first()
        session.delete(semaphor_record)
        session.commit()
def save_datasource(self, name, source_type, source, file_path=None):
    datasource_record = Datasource(company_id=self.company_id, name=name)

    if source_type == 'file' and (file_path is None):
        raise Exception('`file_path` argument required when source_type == "file"')

    ds_meta_dir = os.path.join(self.dir, name)
    os.mkdir(ds_meta_dir)

    session.add(datasource_record)
    session.commit()
    datasource_record = session.query(Datasource).filter_by(company_id=self.company_id, name=name).first()

    try:
        if source_type == 'file':
            source = os.path.join(ds_meta_dir, source)
            shutil.move(file_path, source)
            ds = FileDS(source)

            creation_info = {
                'class': 'FileDS',
                'args': [source],
                'kwargs': {}
            }
        elif source_type in self.config['integrations']:
            integration = self.config['integrations'][source_type]

            ds_class_map = {
                'clickhouse': ClickhouseDS,
                'mariadb': MariaDS,
                'mysql': MySqlDS,
                'postgres': PostgresDS,
                'mssql': MSSQLDS,
                'mongodb': MongoDS,
                'snowflake': SnowflakeDS
            }

            try:
                dsClass = ds_class_map[integration['type']]
            except KeyError:
                raise KeyError(f"Unknown DS type: {source_type}, type is {integration['type']}")

            if integration['type'] in ['clickhouse']:
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'query': source['query'],
                        'user': integration['user'],
                        'password': integration['password'],
                        'host': integration['host'],
                        'port': integration['port']
                    }
                }
                ds = dsClass(**creation_info['kwargs'])
            elif integration['type'] in ['mssql', 'postgres', 'mariadb', 'mysql']:
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'query': source['query'],
                        'user': integration['user'],
                        'password': integration['password'],
                        'host': integration['host'],
                        'port': integration['port']
                    }
                }

                if 'database' in integration:
                    creation_info['kwargs']['database'] = integration['database']

                if 'database' in source:
                    creation_info['kwargs']['database'] = source['database']

                ds = dsClass(**creation_info['kwargs'])
            elif integration['type'] == 'snowflake':
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'query': source['query'],
                        'schema': source['schema'],
                        'warehouse': source['warehouse'],
                        'database': source['database'],
                        'host': integration['host'],
                        'password': integration['password'],
                        'user': integration['user'],
                        'account': integration['account']
                    }
                }

                ds = dsClass(**creation_info['kwargs'])
            elif integration['type'] == 'mongodb':
                if isinstance(source['find'], str):
                    source['find'] = json.loads(source['find'])
                creation_info = {
                    'class': dsClass.__name__,
                    'args': [],
                    'kwargs': {
                        'database': source['database'],
                        'collection': source['collection'],
                        'query': source['find'],
                        'user': integration['user'],
                        'password': integration['password'],
                        'host': integration['host'],
                        'port': integration['port']
                    }
                }

                ds = dsClass(**creation_info['kwargs'])
        else:
            # This probably only happens for urls
            ds = FileDS(source)
            creation_info = {
                'class': 'FileDS',
                'args': [source],
                'kwargs': {}
            }

        df = ds.df

        if '' in df.columns or len(df.columns) != len(set(df.columns)):
            shutil.rmtree(ds_meta_dir)
            raise Exception('Each column in datasource must have unique non-empty name')

        datasource_record.creation_info = json.dumps(creation_info)
        datasource_record.data = json.dumps({
            'source_type': source_type,
            'source': source,
            'row_count': len(df),
            'columns': [dict(name=x) for x in list(df.keys())]
        })

        self.fs_store.put(name, f'datasource_{self.company_id}_{datasource_record.id}', self.dir)
    except Exception:
        if os.path.isdir(ds_meta_dir):
            shutil.rmtree(ds_meta_dir)
        raise

    session.commit()
    return self.get_datasource_obj(name, raw=True), name
def delete_all_streams(self):
    for stream in self.streams.copy():
        self.streams[stream].set()
        del self.streams[stream]
    session.query(Stream).filter_by(company_id=self.company_id, integration=self.name).delete()
    session.commit()
def run_update(name: str, company_id: int):
    original_name = name
    name = f'{company_id}@@@@@{name}'

    fs_store = FsStore()
    config = Config()
    data_store = WithKWArgsWrapper(DataStore(), company_id=company_id)

    try:
        predictor_record = Predictor.query.filter_by(company_id=company_id, name=original_name).first()
        assert predictor_record is not None

        predictor_record.update_status = 'updating'
        session.commit()

        ds = data_store.get_datasource_obj(None, raw=False, id=predictor_record.datasource_id)
        df = ds.df

        problem_definition = predictor_record.learn_args
        problem_definition['target'] = predictor_record.to_predict[0]

        if 'join_learn_process' in problem_definition:
            del problem_definition['join_learn_process']

        # Adapt kwargs to problem definition
        if 'timeseries_settings' in problem_definition:
            problem_definition['timeseries_settings'] = problem_definition['timeseries_settings']

        if 'stop_training_in_x_seconds' in problem_definition:
            problem_definition['time_aim'] = problem_definition['stop_training_in_x_seconds']

        json_ai = lightwood.json_ai_from_problem(df, problem_definition)
        predictor_record.json_ai = json_ai.to_dict()
        predictor_record.code = lightwood.code_from_json_ai(json_ai)
        predictor_record.data = {'training_log': 'training'}
        session.commit()

        predictor: lightwood.PredictorInterface = lightwood.predictor_from_code(predictor_record.code)
        predictor.learn(df)

        fs_name = f'predictor_{predictor_record.company_id}_{predictor_record.id}'
        pickle_path = os.path.join(config['paths']['predictors'], fs_name)
        predictor.save(pickle_path)
        fs_store.put(fs_name, fs_name, config['paths']['predictors'])
        predictor_record.data = predictor.model_analysis.to_dict()  # type: ignore
        session.commit()

        predictor_record.lightwood_version = lightwood_version
        predictor_record.mindsdb_version = mindsdb_version
        predictor_record.update_status = 'up_to_date'
        session.commit()
    except Exception as e:
        log.error(e)
        predictor_record.update_status = 'update_failed'  # type: ignore
        session.commit()
        return str(e)
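# A condensed sketch of the lightwood round trip that run_update (and run_fit) rely
# on, shown on a toy DataFrame. The column names here are illustrative; only the
# lightwood calls come from the code above.
import pandas as pd
import lightwood
from lightwood.api.types import ProblemDefinition

toy_df = pd.DataFrame({'sqft': [700, 900, 1200, 1500], 'price': [100, 130, 180, 210]})
pdef = ProblemDefinition.from_dict({'target': 'price'})

json_ai = lightwood.json_ai_from_problem(toy_df, pdef)  # analyse the data, draft a JsonAI spec
code = lightwood.code_from_json_ai(json_ai)             # render the spec into Python source
predictor = lightwood.predictor_from_code(code)         # compile the source into a predictor
predictor.learn(toy_df)
predictions = predictor.predict(toy_df)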
def save_datasource(self, name, source_type, source, file_path=None, company_id=None):
    if source_type == 'file' and (file_path is None):
        raise Exception('`file_path` argument required when source_type == "file"')

    datasource_record = session.query(Datasource).filter_by(company_id=company_id, name=name).first()
    if datasource_record is not None:
        raise Exception(f'Datasource with name {name} already exists')

    try:
        datasource_record = Datasource(
            company_id=company_id,
            name=name,
            datasources_version=mindsdb_datasources.__version__,
            mindsdb_version=mindsdb_version
        )
        session.add(datasource_record)
        session.commit()

        ds_meta_dir = os.path.join(self.dir, f'{company_id}@@@@@{name}')
        os.mkdir(ds_meta_dir)

        ds, creation_info = self.create_datasource(source_type, source, file_path, company_id, ds_meta_dir)

        if hasattr(ds, 'get_columns') and hasattr(ds, 'get_row_count'):
            try:
                column_names = ds.get_columns()
                row_count = ds.get_row_count()
            except Exception:
                df = ds.df
                column_names = list(df.keys())
                row_count = len(df)
        else:
            df = ds.df
            column_names = list(df.keys())
            row_count = len(df)

        if '' in column_names or len(column_names) != len(set(column_names)):
            shutil.rmtree(ds_meta_dir)
            raise Exception('Each column in datasource must have unique non-empty name')

        datasource_record.creation_info = json.dumps(creation_info)
        datasource_record.data = json.dumps({
            'source_type': source_type,
            'source': source,
            'row_count': row_count,
            'columns': [dict(name=x) for x in column_names]
        })

        self.fs_store.put(f'{company_id}@@@@@{name}', f'datasource_{company_id}_{datasource_record.id}', self.dir)
        session.commit()
    except Exception as e:
        log.error(f'Error creating datasource {name}, exception: {e}')
        try:
            self.delete_datasource(name, company_id=company_id)
        except Exception:
            pass
        raise e

    return self.get_datasource_obj(name, raw=True, company_id=company_id)
def add_db_integration(name, data, company_id):
    if 'database_name' not in data:
        data['database_name'] = name
    if 'publish' not in data:
        data['publish'] = True

    bundle_path = data.get('secure_connect_bundle')
    if data.get('type') in ('cassandra', 'scylla') and _is_not_empty_str(bundle_path):
        if os.path.isfile(bundle_path) is False:
            raise Exception(f'Can not get access to file: {bundle_path}')
        integrations_dir = Config()['paths']['integrations']

        p = Path(bundle_path)
        data['secure_connect_bundle'] = p.name

        integration_record = Integration(name=name, data=data, company_id=company_id)
        session.add(integration_record)
        session.commit()
        integration_id = integration_record.id

        folder_name = f'integration_files_{company_id}_{integration_id}'
        integration_dir = os.path.join(integrations_dir, folder_name)
        create_directory(integration_dir)
        shutil.copyfile(bundle_path, os.path.join(integration_dir, p.name))

        FsStore().put(folder_name, integration_dir, integrations_dir)
    elif data.get('type') in ('mysql', 'mariadb'):
        ssl = data.get('ssl')
        files = {}
        temp_dir = None
        if ssl is True:
            for key in ['ssl_ca', 'ssl_cert', 'ssl_key']:
                if key not in data:
                    continue
                if os.path.isfile(data[key]) is False:
                    if _is_not_empty_str(data[key]) is False:
                        raise Exception("'ssl_ca', 'ssl_cert' and 'ssl_key' must be paths or inline certs")
                    if temp_dir is None:
                        temp_dir = tempfile.mkdtemp(prefix='integration_files_')
                    cert_file_name = data.get(f'{key}_name', f'{key}.pem')
                    cert_file_path = os.path.join(temp_dir, cert_file_name)
                    with open(cert_file_path, 'wt') as f:
                        f.write(data[key])
                    data[key] = cert_file_path
                files[key] = data[key]
                p = Path(data[key])
                data[key] = p.name

        integration_record = Integration(name=name, data=data, company_id=company_id)
        session.add(integration_record)
        session.commit()
        integration_id = integration_record.id

        if len(files) > 0:
            integrations_dir = Config()['paths']['integrations']
            folder_name = f'integration_files_{company_id}_{integration_id}'
            integration_dir = os.path.join(integrations_dir, folder_name)
            create_directory(integration_dir)
            for file_path in files.values():
                p = Path(file_path)
                shutil.copyfile(file_path, os.path.join(integration_dir, p.name))
            FsStore().put(folder_name, integration_dir, integrations_dir)
    else:
        integration_record = Integration(name=name, data=data, company_id=company_id)
        session.add(integration_record)
        session.commit()
def _unlock_predictor(self, id):
    from mindsdb.interfaces.storage.db import session, Semaphor
    semaphor_record = session.query(Semaphor).filter_by(
        company_id=self.company_id, entity_id=id, entity_type='predictor').first()
    if semaphor_record is not None:
        session.delete(semaphor_record)
        session.commit()
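# A hedged sketch of pairing _lock_predictor/_unlock_predictor so the semaphore row
# is removed even when the guarded block raises. `model_interface` stands in for an
# instance of the surrounding class; the real code appears to pass a similar
# `_lock_context` helper into update_model above.
from contextlib import contextmanager

@contextmanager
def predictor_lock(model_interface, predictor_id, mode='write'):
    model_interface._lock_predictor(predictor_id, mode=mode)
    try:
        yield
    finally:
        model_interface._unlock_predictor(predictor_id)

# Usage:
# with predictor_lock(model_interface, 42, mode='read'):
#     ...  # predictor 42 stays locked for the duration of this block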