def initialize_interfaces(app):
    app.default_store = DataStore()
    app.mindsdb_native = NativeInterface()
    app.custom_models = CustomModels()
    app.dbw = DatabaseWrapper()
    config = Config()
    app.config_obj = config
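# --- Hedged wiring sketch, for illustration only. It assumes a Flask app
# object (Flask itself is an assumption; any attribute-bearing app object
# would work the same way). ---
from flask import Flask

app = Flask(__name__)
initialize_interfaces(app)
# Route handlers can then share the singletons, e.g.
# app.default_store.get_datasources() or app.mindsdb_native.get_models()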
def run_environment(config, apis=['mysql'], override_integration_config={},
                    override_api_config={}, mindsdb_database='mindsdb',
                    clear_storage=True):
    temp_config_path = prepare_config(config, mindsdb_database,
                                      override_integration_config,
                                      override_api_config, clear_storage)
    config = Config(temp_config_path)

    api_str = ','.join(apis)
    sp = subprocess.Popen(
        ['python3', '-m', 'mindsdb', '--api', api_str,
         '--config', temp_config_path, '--verbose'],
        close_fds=True,
        stdout=OUTPUT,
        stderr=OUTPUT
    )
    atexit.register(stop_mindsdb, sp=sp)

    async def wait_port_async(port, timeout):
        start_time = time.time()
        started = is_port_in_use(port)
        while (time.time() - start_time) < timeout and started is False:
            await asyncio.sleep(1)
            started = is_port_in_use(port)
        return started

    async def wait_apis_start(ports):
        futures = [wait_port_async(port, 60) for port in ports]
        success = True
        for i, future in enumerate(asyncio.as_completed(futures)):
            success = success and await future
        return success

    ports_to_wait = [config['api'][api]['port'] for api in apis]

    ioloop = asyncio.get_event_loop()
    if ioloop.is_closed():
        ioloop = asyncio.new_event_loop()
    success = ioloop.run_until_complete(wait_apis_start(ports_to_wait))
    ioloop.close()
    if not success:
        raise Exception("Can't start mindsdb apis")

    CONFIG.MINDSDB_STORAGE_PATH = config.paths['predictors']
    mdb = NativeInterface(config)
    datastore = DataStore(config)

    return mdb, datastore
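# --- Hedged test-harness sketch: spin up MindsDB with two APIs and wait for
# their ports. `config` is whatever prepare_config accepts; its exact shape
# is an assumption here. ---
mdb, datastore = run_environment(config, apis=['http', 'mysql'],
                                 clear_storage=True)
# The test can now train through the native interface (mdb) and upload data
# through the datastore; shutdown is already registered via atexit.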
def __init__(self, config):
    mongodb_config = config['api'].get('mongodb')
    assert mongodb_config is not None, "there is no 'mongodb' config!"
    host = mongodb_config['host']
    port = mongodb_config['port']
    log.debug(f'start mongo server on {host}:{port}')

    super().__init__((host, int(port)), MongoRequestHandler)

    self.mindsdb_env = {
        'config': config,
        'data_store': DataStore(),
        'mindsdb_native': NativeInterface()
    }

    respondersCollection = RespondersCollection()

    opQueryResponder = OpQueryResponder(respondersCollection)
    opMsgResponder = OpMsgResponder(respondersCollection)
    opInsertResponder = OpInsertResponder(respondersCollection)

    self.operationsHandlersMap = {
        OP_QUERY: opQueryResponder,
        OP_MSG: opMsgResponder,
        OP_INSERT: opInsertResponder
    }

    respondersCollection.add(when={'drop': 'system.sessions'}, result={'ok': 1})
    respondersCollection.add(when={'update': 'system.version'}, result={'ok': 1})
    respondersCollection.add(
        when={'setFeatureCompatibilityVersion': helpers.is_true},
        result={'ok': 1}
    )
    # Example captured message:
    # OpMSG=OrderedDict([('features', 1), ('$clusterTime', OrderedDict([('clusterTime', Timestamp(1599748325, 1)), ('signature', OrderedDict([('hash', b'\xb8\xc3\x03\x18\xca\xe6bh\xf0\xcb47,\x924\x8a >\xfc\x91'), ('keyId', 6870854312365391875)]))])), ('$configServerState', OrderedDict([('opTime', OrderedDict([('ts', Timestamp(1599748325, 1)), ('t', 1)]))])), ('$db', 'admin')])
    respondersCollection.add(when={'features': helpers.is_true}, result={'ok': 1})
    # Example captured message:
    # OpMSG=OrderedDict([('serverStatus', 1), ('$clusterTime', OrderedDict([('clusterTime', Timestamp(1599748366, 1)), ('signature', OrderedDict([('hash', b'\xa1E}\xbbIU\xc2D\x95++\x82\x88\xb5\x84\xf5\xda)+B'), ('keyId', 6870854312365391875)]))])), ('$configServerState', OrderedDict([('opTime', OrderedDict([('ts', Timestamp(1599748366, 1)), ('t', 1)]))])), ('$db', 'admin')])
    respondersCollection.add(when={'serverStatus': helpers.is_true}, result={'ok': 1})
    # Example captured message:
    # OpMSG=OrderedDict([('ismaster', 1), ('$db', 'admin'), ('$clusterTime', OrderedDict([('clusterTime', Timestamp(1599749031, 1)), ('signature', OrderedDict([('hash', b'6\x87\xd5Y\xa7\xc7\xcf$\xab\x1e\xa2{\xe5B\xe5\x99\xdbl\x8d\xf4'), ('keyId', 6870854312365391875)]))])), ('$client', OrderedDict([('application', OrderedDict([('name', 'MongoDB Shell')])), ('driver', OrderedDict([('name', 'MongoDB Internal Client'), ('version', '3.6.3')])), ('os', OrderedDict([('type', 'Linux'), ('name', 'Ubuntu'), ('architecture', 'x86_64'), ('version', '18.04')])), ('mongos', OrderedDict([('host', 'maxs-comp:27103'), ('client', '127.0.0.1:52148'), ('version', '3.6.3')]))])), ('$configServerState', OrderedDict([('opTime', OrderedDict([('ts', Timestamp(1599749031, 1)), ('t', 1)]))]))])

    respondersCollection.responders += responders
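# --- Hedged start-up sketch: the super().__init__((host, port), handler)
# call suggests a stdlib socketserver-based server, so it could be run as
# below. The class name `MongoServer` is assumed for illustration. ---
server = MongoServer(config)
server.serve_forever()   # blocks, dispatching OP_QUERY / OP_MSG / OP_INSERT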
class MindsDBDataNode(DataNode):
    type = 'mindsdb'

    def __init__(self, config):
        self.config = config
        self.mindsdb_native = NativeInterface(config)
        self.custom_models = CustomModels(config)

    def getTables(self):
        models = self.mindsdb_native.get_models()
        models = [x['name'] for x in models if x['status'] == 'complete']
        models += ['predictors', 'commands']
        models += [x['name'] for x in self.custom_models.get_models()]
        return models

    def hasTable(self, table):
        return table in self.getTables()

    def getTableColumns(self, table):
        try:
            columns = self.custom_models.get_model_data(table)['data_analysis_v2']['columns']
            columns += ['external_datasource', 'select_data_query', 'when_data']
            return columns
        except Exception:
            pass

        if table == 'predictors':
            return ['name', 'status', 'accuracy', 'predict', 'select_data_query',
                    'external_datasource', 'training_options']
        if table == 'commands':
            return ['command']

        model = self.mindsdb_native.get_model_data(name=table)

        columns = []
        columns += model['data_analysis_v2']['columns']
        columns += [f'{x}_original' for x in model['predict']]
        for col in model['predict']:
            if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                columns += [f'{col}_min', f'{col}_max']
            columns += [f'{col}_confidence']
            columns += [f'{col}_explain']

        # TODO this should be added just for clickhouse queries
        columns += ['when_data', 'select_data_query', 'external_datasource']
        return columns

    def _select_predictors(self):
        models = self.mindsdb_native.get_models()
        # TODO add custom models
        return [{
            'name': x['name'],
            'status': x['status'],
            'accuracy': str(x['accuracy']) if x['accuracy'] is not None else None,
            'predict': ', '.join(x['predict']),
            'select_data_query': '',
            'external_datasource': '',  # TODO
            'training_options': ''      # TODO ?
        } for x in models]

    def delete_predictor(self, name):
        self.mindsdb_native.delete_model(name)

    def select(self, table, columns=None, where=None, where_data=None,
               order_by=None, group_by=None, came_from=None):
        ''' NOTE WHERE statements can be just $eq joined with 'and' '''
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []

        original_when_data = None
        if 'when_data' in where:
            if len(where) > 1:
                raise ValueError("No other keys should be used in 'where' if 'when_data' is used")
            try:
                original_when_data = where['when_data']['$eq']
                where_data = json.loads(where['when_data']['$eq'])
                if isinstance(where_data, list) is False:
                    where_data = [where_data]
            except Exception:
                raise ValueError(f'''Error while parsing 'when_data'="{where_data}"''')

        external_datasource = None
        if 'external_datasource' in where:
            external_datasource = where['external_datasource']['$eq']
            del where['external_datasource']

        select_data_query = None
        if came_from is not None and 'select_data_query' in where:
            select_data_query = where['select_data_query']['$eq']
            del where['select_data_query']

            dbtype = self.config['integrations'][came_from]['type']
            if dbtype == 'clickhouse':
                ch = Clickhouse(self.config, came_from)
                res = ch._query(select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif dbtype == 'mariadb':
                maria = Mariadb(self.config, came_from)
                data = maria._query(select_data_query)
            elif dbtype == 'mysql':
                mysql = MySQL(self.config, came_from)
                data = mysql._query(select_data_query)
            elif dbtype == 'postgres':
                postgres = PostgreSQL(self.config, came_from)
                data = postgres._query(select_data_query)
            elif dbtype == 'mssql':
                mssql = MSSQL(self.config, came_from)
                data = mssql._query(select_data_query, fetch=True)
            else:
                raise Exception(f'Unknown database type: {dbtype}')

            if where_data is None:
                where_data = data
            else:
                where_data += data

        new_where = {}
        if where_data is not None:
            where_data = pandas.DataFrame(where_data)
        else:
            for key, value in where.items():
                if isinstance(value, dict) is False or len(value.keys()) != 1 or list(value.keys())[0] != '$eq':
                    # TODO value should be just string or number
                    raise Exception()
                new_where[key] = value['$eq']

            if len(new_where) == 0:
                return []

            where_data = [new_where]

        try:
            model = self.custom_models.get_model_data(name=table)
        except Exception:
            model = self.mindsdb_native.get_model_data(name=table)

        predicted_columns = model['predict']

        original_target_values = {}
        for col in predicted_columns:
            if where_data is not None:
                if col in where_data:
                    original_target_values[col + '_original'] = list(where_data[col])
                else:
                    original_target_values[col + '_original'] = [None] * len(where_data)
            else:
                original_target_values[col + '_original'] = [None]

        if table in [x['name'] for x in self.custom_models.get_models()]:
            res = self.custom_models.predict(name=table, when_data=where_data)

            data = []
            fields = model['data_analysis_v2']['columns']
            for i, ele in enumerate(res):
                row = {}
                row['select_data_query'] = select_data_query
                row['external_datasource'] = external_datasource
                row['when_data'] = original_when_data

                for key in ele:
                    row[key] = ele[key]['predicted_value']
                    # FIXME prefer to get an int from mindsdb_native in this case
                    if model['data_analysis_v2'][key]['typing']['data_subtype'] == 'Int':
                        row[key] = int(row[key])

                for k in fields:
                    if k not in ele:
                        if isinstance(where_data, list):
                            if k in where_data[i]:
                                row[k] = where_data[i][k]
                            else:
                                row[k] = None
                        elif k in where_data.columns:
                            row[k] = where_data[k].iloc[i]
                        else:
                            row[k] = None

                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                data.append(row)

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in fields if 'typing' in model['data_analysis_v2'][f]
            }
            for row in data:
                cast_row_types(row, field_types)

            return data
        else:
            res = self.mindsdb_native.predict(name=table, when_data=where_data)

            keys = [x for x in list(res._data.keys()) if x in columns]
            min_max_keys = []
            for col in predicted_columns:
                if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                    min_max_keys.append(col)

            data = []
            explains = []
            for i, el in enumerate(res):
                data.append({key: el[key] for key in keys})
                explains.append(el.explain())

            field_types = {
                f: model['data_analysis_v2'][f]['typing']['data_subtype']
                for f in model['data_analysis_v2']['columns']
                if 'typing' in model['data_analysis_v2'][f]
            }

            for i, row in enumerate(data):
                cast_row_types(row, field_types)

                row['select_data_query'] = select_data_query
                row['external_datasource'] = external_datasource
                row['when_data'] = original_when_data

                for k in original_target_values:
                    row[k] = original_target_values[k][i]

                explanation = explains[i]
                for key in predicted_columns:
                    row[key + '_confidence'] = explanation[key]['confidence']
                    row[key + '_explain'] = json.dumps(explanation[key], cls=NumpyJSONEncoder)
                for key in min_max_keys:
                    row[key + '_min'] = min(explanation[key]['confidence_interval'])
                    row[key + '_max'] = max(explanation[key]['confidence_interval'])

            return data
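# --- Hedged usage sketch: how the SQL layer might call select() for a
# trained predictor with inline when_data. The predictor name, column
# values, and `config` are illustrative, not taken from the source. ---
node = MindsDBDataNode(config)

rows = node.select(
    table='home_rentals_model',
    columns=node.getTableColumns('home_rentals_model'),
    where={'when_data': {'$eq': json.dumps({'sqft': 900, 'rooms': 2})}}
)
# Each row carries predicted values plus the `<col>_confidence` and
# `<col>_explain` companions built inside select() above.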
class DataStore():
    def __init__(self):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dir = self.config.paths['datasources']
        self.mindsdb_native = NativeInterface()

    def get_analysis(self, name):
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()
        if datasource_record.analysis is None:
            datasource_record.analysis = json.dumps(
                self.mindsdb_native.analyse_dataset(self.get_datasource_obj(name)))
            session.commit()

        analysis = json.loads(datasource_record.analysis)
        return analysis

    def get_datasources(self, name=None):
        datasource_arr = []
        if name is not None:
            datasource_record_arr = session.query(Datasource).filter_by(
                company_id=self.company_id, name=name)
        else:
            datasource_record_arr = session.query(Datasource).filter_by(
                company_id=self.company_id)
        for datasource_record in datasource_record_arr:
            try:
                datasource = json.loads(datasource_record.data)
                datasource['created_at'] = datasource_record.created_at
                datasource['updated_at'] = datasource_record.updated_at
                datasource['name'] = datasource_record.name
                datasource['id'] = datasource_record.id
                datasource_arr.append(datasource)
            except Exception as e:
                log.error(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        offset = 0 if offset is None else offset
        ds = self.get_datasource_obj(name)

        if limit is not None:
            # @TODO Add `offset` to the `filter` method of the datasource
            # and get rid of `offset` here
            filtered_ds = ds.filter(where=where, limit=limit + offset).iloc[offset:]
        else:
            filtered_ds = ds.filter(where=where)

        filtered_ds = filtered_ds.where(pd.notnull(filtered_ds), None)
        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        datasource_arr = self.get_datasources(name)
        if len(datasource_arr) == 1:
            return datasource_arr[0]
        # @TODO: Remove when the db switch is more stable. This should never
        # happen, but it is a good sanity check while this is still buggy.
        elif len(datasource_arr) > 1:
            log.error(f'Two or more datasources with the same name ({len(datasource_arr)}) | Full list: {datasource_arr}')
            raise Exception('Two or more datasources with the same name')
        return None

    def delete_datasource(self, name):
        datasource_record = Datasource.query.filter_by(
            company_id=self.company_id, name=name).first()
        id = datasource_record.id
        session.delete(datasource_record)
        session.commit()
        self.fs_store.delete(f'datasource_{self.company_id}_{id}')
        try:
            shutil.rmtree(os.path.join(self.dir, name))
        except Exception:
            pass

    def save_datasource(self, name, source_type, source, file_path=None):
        datasource_record = Datasource(company_id=self.company_id, name=name)

        if source_type == 'file' and (file_path is None):
            raise Exception('`file_path` argument required when source_type == "file"')

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        session.add(datasource_record)
        session.commit()
        datasource_record = session.query(Datasource).filter_by(
            company_id=self.company_id, name=name).first()

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)
                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }
            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(f"Unknown DS type: {source_type}, type is {integration['type']}")

                if integration['type'] in ['clickhouse']:
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**creation_info['kwargs'])
                elif integration['type'] in ['mssql', 'postgres', 'mariadb', 'mysql']:
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    if 'database' in integration:
                        creation_info['kwargs']['database'] = integration['database']
                    if 'database' in source:
                        creation_info['kwargs']['database'] = source['database']
                    ds = dsClass(**creation_info['kwargs'])
                elif integration['type'] == 'snowflake':
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }
                    ds = dsClass(**creation_info['kwargs'])
                elif integration['type'] == 'mongodb':
                    if isinstance(source['find'], str):
                        source['find'] = json.loads(source['find'])
                    creation_info = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**creation_info['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                creation_info = {
                    'class': 'FileDS',
                    'args': [source],
                    'kwargs': {}
                }

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception('Each column in datasource must have a unique, non-empty name')

            datasource_record.creation_info = json.dumps(creation_info)
            datasource_record.data = json.dumps({
                'source_type': source_type,
                'source': source,
                'row_count': len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            })

            self.fs_store.put(name, f'datasource_{self.company_id}_{datasource_record.id}', self.dir)
        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        session.commit()
        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        try:
            datasource_record = session.query(Datasource).filter_by(
                company_id=self.company_id, name=name).first()
            self.fs_store.get(name, f'datasource_{self.company_id}_{datasource_record.id}', self.dir)
            creation_info = json.loads(datasource_record.creation_info)
            if raw:
                return creation_info
            else:
                return eval(creation_info['class'])(*creation_info['args'],
                                                    **creation_info['kwargs'])
        except Exception as e:
            log.error(f'\n{e}\n')
            return None
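# --- Hedged usage sketch: illustrative names throughout; assumes
# MINDSDB_COMPANY_ID is set and a 'clickhouse' integration exists in the
# config. ---
store = DataStore()
creation_info, ds_name = store.save_datasource(
    'rentals_ds', 'clickhouse',
    {'query': 'SELECT * FROM default.home_rentals LIMIT 1000'}
)
analysis = store.get_analysis(ds_name)   # cached on the Datasource record after the first call
store.delete_datasource(ds_name)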
def __init__(self, config):
    # NOTE: the passed-in `config` is not used here; the singleton
    # `Config()` is read instead.
    self.config = Config()
    self.mindsdb_native = NativeInterface()
    self.custom_models = CustomModels()
    self.ai_table = AITable_store()
    self.default_store = DataStore()
for api_name in apis.keys():
    if api_name not in config['api']:
        print(f"Trying to run the '{api_name}' API, but there is no config for it.")
        print(f"Please fill in config['api']['{api_name}']")
        sys.exit(0)

start_functions = {
    'http': start_http,
    'mysql': start_mysql,
    'mongodb': start_mongo
}

archive_obsolete_predictors(config, '2.11.0')

mdb = NativeInterface(config)
cst = CustomModels(config)
remove_corrupted_predictors(config, mdb)
model_data_arr = get_all_models_meta_data(mdb, cst)

dbw = DatabaseWrapper(config)
for db_alias in config['integrations']:
    dbw.setup_integration(db_alias)
dbw.register_predictors(model_data_arr)

for broken_name in [name for name, connected in dbw.check_connections().items()
                    if connected is False]:
    log.error(f'Error: failed to integrate with database aliased: {broken_name}')
class CustomModels():
    def __init__(self):
        self.config = Config()
        self.fs_store = FsSotre()
        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.dbw = DatabaseWrapper()
        self.storage_dir = self.config['paths']['custom_models']
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = NativeInterface()

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, name))

    def _internal_load(self, name):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)
        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(os.path.join(self._dir(name), 'model.pickle'))
        except Exception:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model
        return model

    def learn(self, name, from_data, to_predict, datasource_id, kwargs={}):
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
        data_source = getattr(mindsdb_native, from_data['class'])(
            *from_data['args'], **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis_v2'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        model_data['columns'] = list(data_analysis.keys())
        self.save_model_data(name, model_data)

        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)])

    def predict(self, name, when_data=None, from_data=None, kwargs=None):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)
        if kwargs is None:
            kwargs = {}
        if from_data is not None:
            if isinstance(from_data, dict):
                data_source = getattr(mindsdb_native, from_data['class'])(
                    *from_data['args'], **from_data['kwargs'])
            # assume that a particular instance of some DataSource class was provided
            else:
                data_source = from_data
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                for k in when_data:
                    when_data[k] = [when_data[k]]
                data_frame = pd.DataFrame(when_data)
            else:
                data_frame = pd.DataFrame(when_data)

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            pred_arr.append({})
            for col in predictions.columns:
                pred_arr[-1][col] = {}
                pred_arr[-1][col]['predicted_value'] = predictions[col].iloc[i]

        return pred_arr

    def get_model_data(self, name):
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        return predictor_record.data

    def save_model_data(self, name, data):
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        if predictor_record is None:
            predictor_record = Predictor(company_id=self.company_id, name=name,
                                         is_custom=True, data=data)
            session.add(predictor_record)
        else:
            predictor_record.data = data
        session.commit()

    def get_models(self):
        predictor_names = [
            x.name for x in Predictor.query.filter_by(
                company_id=self.company_id, is_custom=True)
        ]
        models = []
        for name in predictor_names:
            models.append(self.get_model_data(name))
        return models

    def delete_model(self, name):
        Predictor.query.filter_by(company_id=self.company_id, name=name,
                                  is_custom=True).delete()
        session.commit()
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def rename_model(self, name, new_name):
        self.fs_store.get(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)
        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))
        predictor_record = Predictor.query.filter_by(
            company_id=self.company_id, name=name, is_custom=True).first()
        predictor_record.name = new_name
        session.commit()
        self.dbw.register_predictors([self.get_model_data(new_name)])

        self.fs_store.put(new_name, f'custom_model_{self.company_id}_{new_name}', self.storage_dir)
        self.fs_store.delete(f'custom_model_{self.company_id}_{name}')

    def export_model(self, name):
        shutil.make_archive(base_name=name, format='zip', root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(model.to_predict, list) else [model.to_predict]
        self.save_model_data(name, {
            'name': name,
            'data_analysis_v2': model.column_type_map,
            'predict': model.to_predict,
            'status': trained_status,
            'is_custom': True,
            'columns': list(model.column_type_map.keys())
        })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        self.fs_store.put(name, f'custom_model_{self.company_id}_{name}', self.storage_dir)

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)])
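# --- Hedged usage sketch: import a zipped custom model and run a single
# prediction. The archive path, model name, and column names are
# illustrative, not from the source. ---
cm = CustomModels()
cm.load_model('/tmp/my_model.zip', 'my_model', trained_status='trained')
preds = cm.predict('my_model', when_data={'sqft': 900, 'rooms': 2})
print(preds[0])   # e.g. {'rental_price': {'predicted_value': ...}}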
    } for api in api_arr
}

for api_name in apis.keys():
    if api_name not in config['api']:
        print(f"Trying to run the '{api_name}' API, but there is no config for it.")
        print(f"Please fill in config['api']['{api_name}']")
        sys.exit(0)

start_functions = {
    'http': start_http,
    'mysql': start_mysql,
    'mongodb': start_mongo
}

mdb = NativeInterface()
cst = CustomModels()
model_data_arr = get_all_models_meta_data(mdb, cst)

dbw = DatabaseWrapper()
for db_alias in config['integrations']:
    dbw.setup_integration(db_alias)
dbw.register_predictors(model_data_arr)

for broken_name in [name for name, connected in dbw.check_connections().items()
                    if connected is False]:
    log.error(f'Error: failed to integrate with database aliased: {broken_name}')

ctx = mp.get_context('spawn')
# Switch to this once the native interface has its own thread :/
# ctx = mp.get_context(get_mp_context())
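# --- Hedged continuation sketch: the spawn context is presumably used to
# launch each API server in its own process. The shape of `apis` values and
# the arguments passed to the start_* functions are assumptions here. ---
for api_name, api_data in apis.items():
    p = ctx.Process(target=start_functions[api_name], args=(config,))
    p.start()
    api_data['process'] = p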
class CustomModels():
    def __init__(self, config):
        self.config = config
        self.dbw = DatabaseWrapper(self.config)
        self.storage_dir = os.path.join(config['storage_dir'], 'misc')
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = NativeInterface(self.config)

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, 'custom_model_' + name))

    def _internal_load(self, name):
        # Caching (2 lines below), currently disabled due to multiprocessing
        # cache invalidation issues
        # if name in self.model_cache:
        #     return self.model_cache[name]

        # "Proper" model loading (3 lines below), currently disabled due to
        # pickling issues
        # spec = importlib.util.spec_from_file_location(name, self._dir(name) + '/model.py')
        # module = importlib.util.module_from_spec(spec)
        # spec.loader.exec_module(module)

        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(os.path.join(self._dir(name), 'model.pickle'))
        except Exception:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model
        return model

    def learn(self, name, from_data, to_predict, kwargs={}):
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
        data_source = getattr(mindsdb_native, from_data['class'])(
            *from_data['args'], **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis_v2'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        self.save_model_data(name, model_data)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)])

    def predict(self, name, when_data=None, from_data=None, kwargs=None):
        if kwargs is None:
            kwargs = {}
        if from_data is not None:
            if isinstance(from_data, dict):
                data_source = getattr(mindsdb_native, from_data['class'])(
                    *from_data['args'], **from_data['kwargs'])
            # assume that a particular instance of some DataSource class was provided
            else:
                data_source = from_data
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                for k in when_data:
                    when_data[k] = [when_data[k]]
                data_frame = pd.DataFrame(when_data)
            else:
                data_frame = pd.DataFrame(when_data)

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            pred_arr.append({})
            for col in predictions.columns:
                pred_arr[-1][col] = {}
                pred_arr[-1][col]['predicted_value'] = predictions[col].iloc[i]

        return pred_arr

    def get_model_data(self, name):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'r') as fp:
            return json.load(fp)

    def save_model_data(self, name, data):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'w') as fp:
            json.dump(data, fp)

    def get_models(self):
        models = []
        for model_dir in os.listdir(self.storage_dir):
            if 'custom_model_' in model_dir:
                name = model_dir.replace('custom_model_', '')
                try:
                    models.append(self.get_model_data(name))
                except Exception:
                    print(f'Model {name} not found!')
        return models

    def delete_model(self, name):
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))
        self.dbw.register_predictors([self.get_model_data(new_name)])

    def export_model(self, name):
        shutil.make_archive(base_name=name, format='zip', root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(model.to_predict, list) else [model.to_predict]
        self.save_model_data(name, {
            'name': name,
            'data_analysis_v2': model.column_type_map,
            'predict': model.to_predict,
            'status': trained_status,
            'is_custom': True
        })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)])
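# --- Both versions of _internal_load imply a duck-typed contract for the
# user-supplied model.py. This is a hedged skeleton of the minimal shape
# such a class would need, inferred from the calls above; it is not an
# official template. ---
import pickle

class Model():
    def initialize_column_types(self):
        # Map column name -> mindsdb-style type info; read back as
        # `column_type_map` by load_model().
        self.column_type_map = {}
        self.to_predict = []

    def setup(self):
        pass  # optional hook, called only if present

    def fit(self, data_frame, to_predict, data_analysis, kwargs):
        pass  # train on the pandas DataFrame

    def predict(self, data_frame, kwargs):
        return data_frame  # must return something DataFrame-like with .columns

    def save(self, path):
        with open(path, 'wb') as fp:
            pickle.dump(self, fp)

    @staticmethod
    def load(path):
        with open(path, 'rb') as fp:
            return pickle.load(fp)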
class DataStore():
    def __init__(self, config):
        self.config = config
        self.dir = config.paths['datasources']
        self.mindsdb_native = NativeInterface(config)

    def get_analysis(self, ds):
        return self.mindsdb_native.analyse_dataset(self.get_datasource_obj(ds))

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(os.path.join(self.dir, ds_name, 'metadata.json'), 'r') as fp:
                    try:
                        datasource = json.load(fp)
                        datasource['created_at'] = parse_dt(datasource['created_at'].split('.')[0])
                        datasource['updated_at'] = parse_dt(datasource['updated_at'].split('.')[0])
                        datasource_arr.append(datasource)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        offset = 0 if offset is None else offset
        ds = self.get_datasource_obj(name)

        if limit is not None:
            # @TODO Add `offset` to the `filter` method of the datasource
            # and get rid of `offset` here
            filtered_ds = ds.filter(where=where, limit=limit + offset).iloc[offset:]
        else:
            filtered_ds = ds.filter(where=where)

        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        shutil.rmtree(os.path.join(self.dir, name))

    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception('`file_path` argument required when source_type == "file"')

        # If a datasource with this name already exists, append a `__<i>__`
        # suffix until a free name is found.
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(f"Unknown DS type: {source_type}, type is {integration['type']}")

                if integration['type'] in ['clickhouse']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] in ['mssql', 'postgres', 'mariadb', 'mysql']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    if 'database' in integration:
                        picklable['kwargs']['database'] = integration['database']
                    if 'database' in source:
                        picklable['kwargs']['database'] = source['database']
                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] == 'snowflake':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] == 'mongodb':
                    if isinstance(source['find'], str):
                        source['find'] = json.loads(source['find'])
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception('Each column in datasource must have a unique name')

            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'wb') as fp:
                pickle.dump(picklable, fp)

            with open(os.path.join(ds_meta_dir, 'metadata.json'), 'w') as fp:
                meta = {
                    'name': name,
                    'source_type': source_type,
                    'source': source,
                    'created_at': str(datetime.datetime.now()).split('.')[0],
                    'updated_at': str(datetime.datetime.now()).split('.')[0],
                    'row_count': len(df),
                    'columns': [dict(name=x) for x in list(df.keys())]
                }
                json.dump(meta, fp, indent=4, sort_keys=True)

            with open(os.path.join(ds_meta_dir, 'versions.json'), 'wt') as fp:
                json.dump(self.config.versions, fp, indent=4, sort_keys=True)
        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds = None
        try:
            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if raw:
                    return picklable
                try:
                    ds = eval(picklable['class'])(*picklable['args'], **picklable['kwargs'])
                except Exception:
                    ds = picklable
            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
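# --- Hedged sketch of the file-backed flow: save a datasource, then read
# back the picklable creation recipe. `config` and the CSV path are
# illustrative. ---
store = DataStore(config)
picklable, ds_name = store.save_datasource('sales', 'file', 'sales.csv',
                                           file_path='/tmp/sales.csv')
print(picklable)                         # {'class': 'FileDS', 'args': [...], 'kwargs': {}}
ds = store.get_datasource_obj(ds_name)   # re-instantiates FileDS via eval()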