import os
import sys
import json
import shutil
import pickle
import datetime

import pandas as pd
from dateutil.parser import parse as parse_dt

import mindsdb_native
# NOTE (assumption): the remaining names used below -- MindsdbNative, DatabaseWrapper,
# FileDS, ClickhouseDS, MariaDS, MySqlDS, PostgresDS, MSSQLDS, MongoDS, SnowflakeDS,
# cast_df_columns_types, create_sqlite_db and get_sqlite_data -- are provided by the
# surrounding MindsDB package and are assumed to be imported here.


class DataStore():
    """Stores datasources on disk: metadata.json, a pickled constructor call and a SQLite cache."""

    def __init__(self, config, storage_dir=None):
        self.config = config
        self.dir = storage_dir if isinstance(storage_dir, str) else config.paths['datasources']
        self.mindsdb_native = MindsdbNative(config)

    def get_analysis(self, ds):
        if isinstance(ds, str):
            return self.mindsdb_native.analyse_dataset(self.get_datasource_obj(ds))
        else:
            return self.mindsdb_native.analyse_dataset(ds)

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(os.path.join(self.dir, ds_name, 'datasource', 'metadata.json'), 'r') as fp:
                    try:
                        datasource = json.load(fp)
                        datasource['created_at'] = parse_dt(datasource['created_at'].split('.')[0])
                        datasource['updated_at'] = parse_dt(datasource['updated_at'].split('.')[0])
                        datasource_arr.append(datasource)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        # @TODO Apply filter directly to postgres/mysql/clickhouse/etc... when the datasource is of that type
        return get_sqlite_data(os.path.join(self.dir, name, 'datasource', 'sqlite.db'),
                               where=where, limit=limit, offset=offset)

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        data_sources = self.get_datasource(name)
        shutil.rmtree(os.path.join(self.dir, data_sources['name']))

    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception('`file_path` argument required when source_type == "file"')

        # If the name is taken, advance a __N__ suffix until a free name is found
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)
        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        os.mkdir(ds_dir)

        if source_type == 'file':
            try:
                source = os.path.join(ds_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
        elif source_type in self.config['integrations']:
            integration = self.config['integrations'][source_type]
            dsClass = None
            picklable = {
                'args': [],
                'kwargs': {
                    'query': source,
                    'user': integration['user'],
                    'password': integration['password'],
                    'host': integration['host'],
                    'port': integration['port']
                }
            }

            if integration['type'] == 'clickhouse':
                dsClass = ClickhouseDS
                picklable['class'] = 'ClickhouseDS'
            elif integration['type'] == 'mariadb':
                dsClass = MariaDS
                picklable['class'] = 'MariaDS'
            elif integration['type'] == 'mysql':
                dsClass = MySqlDS
                picklable['class'] = 'MySqlDS'
            elif integration['type'] == 'postgres':
                dsClass = PostgresDS
                picklable['class'] = 'PostgresDS'
            elif integration['type'] == 'mssql':
                dsClass = MSSQLDS
                picklable['class'] = 'MSSQLDS'
            else:
                raise ValueError(f'Unknown DS source_type: {source_type}')

            try:
                ds = dsClass(query=source,
                             user=integration['user'],
                             password=integration['password'],
                             host=integration['host'],
                             port=integration['port'])
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise
        else:
            # This probably only happens for urls
            print('Create URL data source!')
            try:
                ds = FileDS(source)
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

        df = ds.df
        df_with_types = cast_df_columns_types(df, self.get_analysis(df)['data_analysis_v2'])
        create_sqlite_db(os.path.join(ds_dir, 'sqlite.db'), df_with_types)

        with open(os.path.join(ds_dir, 'ds.pickle'), 'wb') as fp:
            pickle.dump(picklable, fp)

        with open(os.path.join(ds_dir, 'metadata.json'), 'w') as fp:
            meta = {
                'name': name,
                'source_type': source_type,
                'source': source,
                'created_at': str(datetime.datetime.now()).split('.')[0],
                'updated_at': str(datetime.datetime.now()).split('.')[0],
                'row_count': len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            }
            json.dump(meta, fp)

        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        ds = None
        try:
            with open(os.path.join(ds_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if raw:
                    return picklable
                try:
                    # Rebuild the datasource by calling the saved class with its saved args/kwargs
                    ds = eval(picklable['class'])(*picklable['args'], **picklable['kwargs'])
                except Exception:
                    ds = picklable
            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
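# --- Illustrative sketch (not part of the original module) ---
# get_datasource_obj() above rebuilds a datasource from the ds.pickle payload by
# calling eval() on a stored class name and re-applying the saved args/kwargs. The
# helper below shows the same round-trip with a hypothetical stand-in class and an
# explicit class map instead of eval(); the class and path are invented for the example.

class _ExampleFileDS:
    """Hypothetical stand-in for FileDS, used only to demonstrate the ds.pickle format."""

    def __init__(self, path):
        self.path = path


def _example_rebuild_from_pickle_payload():
    import pickle

    # What save_datasource() would store in ds.pickle for a file-backed datasource.
    payload = {'class': 'FileDS', 'args': ['/tmp/example.csv'], 'kwargs': {}}
    blob = pickle.dumps(payload)

    # Resolve the class through an explicit map rather than eval().
    class_map = {'FileDS': _ExampleFileDS}
    restored = pickle.loads(blob)
    ds = class_map[restored['class']](*restored['args'], **restored['kwargs'])
    return ds.path  # -> '/tmp/example.csv'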
class CustomModels():
    """Manages user-supplied custom models stored under <storage_dir>/misc/custom_model_<name>."""

    def __init__(self, config):
        self.config = config
        self.storage_dir = os.path.join(config['storage_dir'], 'misc')
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = MindsdbNative(self.config)
        self.dbw = DatabaseWrapper(self.config)

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, 'custom_model_' + name))

    def _internal_load(self, name):
        # Caching (2 lines below), currently disabled due to multiprocessing cache invalidation issues
        # if name in self.model_cache:
        #     return self.model_cache[name]

        # "Proper" model loading (3 lines below), currently disabled due to pickling issues
        # spec = importlib.util.spec_from_file_location(name, self._dir(name) + '/model.py')
        # module = importlib.util.module_from_spec(spec)
        # spec.loader.exec_module(module)

        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(os.path.join(self._dir(name), 'model.pickle'))
        except Exception:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model
        return model

    def learn(self, name, from_data, to_predict, kwargs={}):
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict, list) else [to_predict]
        data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        self.save_model_data(name, model_data)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)], setup=False)

    def predict(self, name, when_data=None, from_data=None, kwargs={}):
        if from_data is not None:
            data_source = getattr(mindsdb_native, from_data['class'])(*from_data['args'], **from_data['kwargs'])
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                for k in when_data:
                    when_data[k] = [when_data[k]]
            data_frame = pd.DataFrame(when_data)

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            pred_arr.append({})
            for col in predictions.columns:
                pred_arr[-1][col] = {'predicted_value': predictions[col].iloc[i]}

        return pred_arr

    def get_model_data(self, name):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'r') as fp:
            return json.load(fp)

    def save_model_data(self, name, data):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'w') as fp:
            json.dump(data, fp)

    def get_models(self, status='any'):
        models = []
        for model_dir in os.listdir(self.storage_dir):
            if 'custom_model_' in model_dir:
                name = model_dir.replace('custom_model_', '')
                try:
                    models.append(self.get_model_data(name))
                except Exception:
                    print(f'Model {name} not found!')

        return models

    def delete_model(self, name):
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))
        self.dbw.register_predictors([self.get_model_data(new_name)], setup=False)

    def export_model(self, name):
        # Build the archive next to the model directory so the returned path actually exists
        shutil.make_archive(base_name=self._dir(name), format='zip', root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(model.to_predict, list) else [model.to_predict]
        self.save_model_data(name, {
            'name': name,
            'data_analysis': model.column_type_map,
            'predict': model.to_predict,
            'status': trained_status,
            'is_custom': True
        })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)], setup=False)
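# --- Illustrative sketch (not part of the original module) ---
# CustomModels._internal_load() imports the model directory as a module and expects it
# to expose a `Model` class; learn(), predict() and load_model() then rely on
# Model.load()/save(), fit(), predict(), initialize_column_types(), to_predict,
# column_type_map and an optional setup(). The class below is a minimal example of
# that implicit interface; the constant prediction and empty column types are placeholders.

class ExampleCustomModel:
    """Hypothetical custom model following the interface CustomModels expects of `Model`."""

    def __init__(self):
        self.to_predict = []
        self.column_type_map = {}
        self._mean = None

    def initialize_column_types(self):
        # Would normally be filled with per-column type information.
        self.column_type_map = {}

    def fit(self, data_frame, to_predict, data_analysis, kwargs):
        # Trivial "training": remember the mean of the first target column.
        self.to_predict = to_predict
        self._mean = data_frame[to_predict[0]].mean()

    def predict(self, data_frame, kwargs):
        import pandas as pd
        # Return one prediction per input row, as a DataFrame so that
        # CustomModels.predict() can iterate over .columns and .iloc.
        return pd.DataFrame({self.to_predict[0]: [self._mean] * len(data_frame)})

    def save(self, path):
        import pickle
        with open(path, 'wb') as fp:
            pickle.dump(self, fp)

    @staticmethod
    def load(path):
        import pickle
        with open(path, 'rb') as fp:
            return pickle.load(fp)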
class DataStore():
    def __init__(self, config):
        self.config = config
        self.dir = config.paths['datasources']
        self.mindsdb_native = MindsdbNative(config)

    def get_analysis(self, ds):
        if isinstance(ds, str):
            return self.mindsdb_native.analyse_dataset(self.get_datasource_obj(ds))
        else:
            return self.mindsdb_native.analyse_dataset(ds)

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(os.path.join(self.dir, ds_name, 'metadata.json'), 'r') as fp:
                    try:
                        datasource = json.load(fp)
                        datasource['created_at'] = parse_dt(datasource['created_at'].split('.')[0])
                        datasource['updated_at'] = parse_dt(datasource['updated_at'].split('.')[0])
                        datasource_arr.append(datasource)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        if offset is None:
            offset = 0

        ds = self.get_datasource_obj(name)

        # @TODO Remove and add `offset` to the `filter` method of the datasource
        if limit is not None:
            filtered_ds = ds.filter(where=where, limit=limit + offset)
        else:
            filtered_ds = ds.filter(where=where)

        filtered_ds = filtered_ds.iloc[offset:]
        filtered_ds = filtered_ds.where(pd.notnull(filtered_ds), None)
        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        shutil.rmtree(os.path.join(self.dir, name))

    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception('`file_path` argument required when source_type == "file"')

        # If the name is taken, advance a __N__ suffix until a free name is found
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(f"Unknown DS type: {source_type}, type is {integration['type']}")

                if integration['type'] in ['clickhouse']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] in ['mssql', 'postgres', 'mariadb', 'mysql']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    if 'database' in integration:
                        picklable['kwargs']['database'] = integration['database']

                    if 'database' in source:
                        picklable['kwargs']['database'] = source['database']

                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] == 'snowflake':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
                elif integration['type'] == 'mongodb':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception('Each column in the datasource must have a unique name')

            # Not sure if needed
            # summary_analysis = self.get_analysis(ds.filter(limit=200))['data_analysis_v2']

            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'wb') as fp:
                pickle.dump(picklable, fp)

            with open(os.path.join(ds_meta_dir, 'metadata.json'), 'w') as fp:
                meta = {
                    'name': name,
                    'source_type': source_type,
                    'source': source,
                    'created_at': str(datetime.datetime.now()).split('.')[0],
                    'updated_at': str(datetime.datetime.now()).split('.')[0],
                    'row_count': len(df),
                    'columns': [dict(name=x) for x in list(df.keys())]
                }
                json.dump(meta, fp, indent=4, sort_keys=True)

            with open(os.path.join(ds_meta_dir, 'versions.json'), 'wt') as fp:
                json.dump(self.config.versions, fp, indent=4, sort_keys=True)

        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds = None
        try:
            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if raw:
                    return picklable
                try:
                    # Rebuild the datasource by calling the saved class with its saved args/kwargs
                    ds = eval(picklable['class'])(*picklable['args'], **picklable['kwargs'])
                except Exception:
                    ds = picklable
            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
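# --- Illustrative sketch (not part of the original module) ---
# In this version, save_datasource() records the exact constructor call (class name,
# args, kwargs) for each integration in ds.pickle so get_datasource_obj() can replay it
# later. The payload below mirrors what the mysql/mariadb/postgres/mssql branch would
# produce; the integration settings, query and database name are invented for the example.

def _example_integration_payload():
    import pickle

    integration = {                 # hypothetical entry from config['integrations']
        'type': 'mysql',
        'user': 'mindsdb',
        'password': 'secret',
        'host': '127.0.0.1',
        'port': 3306,
    }
    source = {'query': 'SELECT * FROM my_table', 'database': 'my_db'}

    picklable = {
        'class': 'MySqlDS',
        'args': [],
        'kwargs': {
            'query': source['query'],
            'user': integration['user'],
            'password': integration['password'],
            'host': integration['host'],
            'port': integration['port'],
            'database': source['database'],
        },
    }

    # Round-trip through pickle, exactly as ds.pickle is written and read back.
    return pickle.loads(pickle.dumps(picklable))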
class DataStore():
    def __init__(self, config, storage_dir=None):
        self.config = config
        self.dir = storage_dir if isinstance(storage_dir, str) else config['interface']['datastore']['storage_dir']
        self.mindsdb_native = MindsdbNative(config)

    def get_analysis(self, ds):
        try:
            return self.mindsdb_native.analyse_dataset(ds)
        except Exception:
            return self.mindsdb_native.analyse_dataset(self.get_datasource_obj(ds))

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(os.path.join(self.dir, ds_name, 'datasource', 'metadata.json'), 'r') as fp:
                    try:
                        datasource = json.load(fp)
                        datasource['created_at'] = parse_dt(datasource['created_at'].split('.')[0])
                        datasource['updated_at'] = parse_dt(datasource['updated_at'].split('.')[0])
                        datasource_arr.append(datasource)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        # @TODO Apply filter directly to postgres/mysql/clickhouse/etc... when the datasource is of that type
        return get_sqlite_data(os.path.join(self.dir, name, 'datasource', 'sqlite.db'),
                               where=where, limit=limit, offset=offset)

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        data_sources = self.get_datasource(name)
        shutil.rmtree(os.path.join(self.dir, data_sources['name']))

    def save_datasource(self, name, source_type, source, file_path=None):
        print(name, source_type, source)
        if source_type == 'file' and (file_path is None):
            raise Exception('`file_path` argument required when source_type == "file"')

        # If the name is taken, advance a __N__ suffix until a free name is found
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)
        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        os.mkdir(ds_dir)

        print(source_type)
        if source_type == 'file':
            source = os.path.join(ds_dir, source)
            os.replace(file_path, source)
            ds = FileDS(source)
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
        elif source_type == 'clickhouse':
            user = self.config['integrations']['default_clickhouse']['user']
            password = self.config['integrations']['default_clickhouse']['password']
            # TODO add host port params
            ds = ClickhouseDS(source, user=user, password=password)
            picklable = {
                'class': 'ClickhouseDS',
                'args': [source],
                'kwargs': {
                    'user': user,
                    'password': password
                }
            }
        elif source_type == 'mariadb':
            user = self.config['integrations']['default_mariadb']['user']
            password = self.config['integrations']['default_mariadb']['password']
            host = self.config['integrations']['default_mariadb']['host']
            port = self.config['integrations']['default_mariadb']['port']
            ds = MariaDS(source, user=user, password=password, host=host, port=port)
            picklable = {
                'class': 'MariaDS',
                'args': [source],
                'kwargs': {
                    'user': user,
                    'password': password,
                    'host': host,
                    'port': port
                }
            }
        else:
            # This probably only happens for urls
            print('Create URL data source!')
            ds = FileDS(source)
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

        df = ds.df
        df_with_types = cast_df_columns_types(df, self.get_analysis(df)['data_analysis_v2'])
        create_sqlite_db(os.path.join(ds_dir, 'sqlite.db'), df_with_types)

        print(picklable)
        with open(os.path.join(ds_dir, 'ds.pickle'), 'wb') as fp:
            pickle.dump(picklable, fp)

        with open(os.path.join(ds_dir, 'metadata.json'), 'w') as fp:
            json.dump({
                'name': name,
                'source_type': source_type,
                'source': source,
                'created_at': str(datetime.datetime.now()).split('.')[0],
                'updated_at': str(datetime.datetime.now()).split('.')[0],
                'row_count': len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            }, fp)

        return self.get_datasource_obj(name, avoid_crash=True)

    def get_datasource_obj(self, name, avoid_crash=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        ds = None
        try:
            # resource.setrlimit(resource.RLIMIT_STACK, [0x10000000, resource.RLIM_INFINITY])
            # sys.setrecursionlimit(0x100000)
            with open(os.path.join(ds_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if avoid_crash:
                    return picklable
                try:
                    # Rebuild the datasource by calling the saved class with its saved args/kwargs
                    ds = eval(picklable['class'])(*picklable['args'], **picklable['kwargs'])
                except Exception:
                    ds = picklable
            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
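# --- Illustrative sketch (not part of the original module) ---
# Every version above stores created_at/updated_at as
# str(datetime.datetime.now()).split('.')[0] in metadata.json, and get_datasources()
# parses them back with dateutil's parse (parse_dt) after again stripping any fractional
# seconds. The round-trip below shows that contract with placeholder metadata; only the
# timestamp handling mirrors the real code.

def _example_metadata_roundtrip():
    import json
    import datetime
    from dateutil.parser import parse as parse_dt

    meta = {
        'name': 'example_ds',                                      # placeholder
        'created_at': str(datetime.datetime.now()).split('.')[0],  # e.g. '2020-01-01 12:00:00'
        'updated_at': str(datetime.datetime.now()).split('.')[0],
        'row_count': 0,                                            # placeholder
    }
    serialized = json.dumps(meta)

    loaded = json.loads(serialized)
    loaded['created_at'] = parse_dt(loaded['created_at'].split('.')[0])
    loaded['updated_at'] = parse_dt(loaded['updated_at'].split('.')[0])
    return loaded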