Example #1
class DataStore():
    def __init__(self, config, storage_dir=None):
        self.config = config
        self.dir = storage_dir if isinstance(
            storage_dir, str) else config.paths['datasources']
        self.mindsdb_native = MindsdbNative(config)

    def get_analysis(self, ds):
        if isinstance(ds, str):
            return self.mindsdb_native.analyse_dataset(
                self.get_datasource_obj(ds))
        else:
            return self.mindsdb_native.analyse_dataset(ds)

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(
                        os.path.join(self.dir, ds_name, 'datasource',
                                     'metadata.json'), 'r') as fp:
                    try:
                        datasource = json.load(fp)
                        datasource['created_at'] = parse_dt(
                            datasource['created_at'].split('.')[0])
                        datasource['updated_at'] = parse_dt(
                            datasource['updated_at'].split('.')[0])
                        datasource_arr.append(datasource)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        # @TODO Apply filter directly to postgres/mysql/clickhouse/etc...  when the datasource is of that type
        return get_sqlite_data(os.path.join(self.dir, name, 'datasource',
                                            'sqlite.db'),
                               where=where,
                               limit=limit,
                               offset=offset)

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        datasource = self.get_datasource(name)
        shutil.rmtree(os.path.join(self.dir, datasource['name']))

    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        os.mkdir(ds_dir)

        if source_type == 'file':
            try:
                source = os.path.join(ds_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise

            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
        elif source_type in self.config['integrations']:
            integration = self.config['integrations'][source_type]
            dsClass = None
            picklable = {
                'args': [],
                'kwargs': {
                    'query': source,
                    'user': integration['user'],
                    'password': integration['password'],
                    'host': integration['host'],
                    'port': integration['port']
                }
            }
            if integration['type'] == 'clickhouse':
                dsClass = ClickhouseDS
                picklable['class'] = 'ClickhouseDS'
            elif integration['type'] == 'mariadb':
                dsClass = MariaDS
                picklable['class'] = 'MariaDS'
            elif integration['type'] == 'mysql':
                dsClass = MySqlDS
                picklable['class'] = 'MySqlDS'
            elif integration['type'] == 'postgres':
                dsClass = PostgresDS
                picklable['class'] = 'PostgresDS'
            elif integration['type'] == 'mssql':
                dsClass = MSSQLDS
                picklable['class'] = 'MSSQLDS'
            else:
                raise ValueError(f'Unknown DS source_type: {source_type}')
            try:
                ds = dsClass(query=source,
                             user=integration['user'],
                             password=integration['password'],
                             host=integration['host'],
                             port=integration['port'])
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise
        else:
            # This probably only happens for urls
            print('Create URL data source !')
            try:
                ds = FileDS(source)
            except Exception:
                shutil.rmtree(ds_meta_dir)
                raise
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

        df = ds.df

        df_with_types = cast_df_columns_types(
            df,
            self.get_analysis(df)['data_analysis_v2'])
        create_sqlite_db(os.path.join(ds_dir, 'sqlite.db'), df_with_types)

        with open(os.path.join(ds_dir, 'ds.pickle'), 'wb') as fp:
            pickle.dump(picklable, fp)

        with open(os.path.join(ds_dir, 'metadata.json'), 'w') as fp:
            meta = {
                'name': name,
                'source_type': source_type,
                'source': source,
                'created_at': str(datetime.datetime.now()).split('.')[0],
                'updated_at': str(datetime.datetime.now()).split('.')[0],
                'row_count': len(df),
                'columns': [dict(name=x) for x in list(df.keys())]
            }
            json.dump(meta, fp)

        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        ds = None
        try:
            with open(os.path.join(ds_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if raw:
                    return picklable
                try:
                    ds = eval(picklable['class'])(*picklable['args'],
                                                  **picklable['kwargs'])
                except Exception:
                    ds = picklable
            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
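
A minimal usage sketch for the DataStore class above. The config object, datasource name, and file paths below are assumptions for illustration; in practice the config comes from MindsDB and must expose paths['datasources'] and an 'integrations' section as the class expects.

# Hypothetical usage sketch; `config` is assumed to be a valid MindsDB config object.
store = DataStore(config)

# Register a local CSV file as a datasource; `file_path` must point at an existing file.
ds_obj, ds_name = store.save_datasource(name='sales',
                                        source_type='file',
                                        source='sales.csv',
                                        file_path='/tmp/sales.csv')

# Read back a page of rows from the sqlite cache built by save_datasource().
rows = store.get_data(ds_name, where=None, limit=10, offset=0)

# List metadata for every stored datasource.
for ds in store.get_datasources():
    print(ds['name'], ds['row_count'])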
Example #2
class CustomModels():
    def __init__(self, config):
        self.config = config
        self.dbw = DatabaseWrapper(self.config)
        self.storage_dir = os.path.join(config['storage_dir'], 'misc')
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = MindsdbNative(self.config)

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, 'custom_model_' + name))

    def _internal_load(self, name):

        # Caching (2 lines below), currently disabled due to multiprocessing cache invalidation issues
        #if name in self.model_cache:
        #    return self.model_cache[name]

        # "Proper" model loading (3 lines below), currently disabled due to pickling issues
        #spec = importlib.util.spec_from_file_location(name, self._dir(name) + '/model.py')
        #module = importlib.util.module_from_spec(spec)
        #spec.loader.exec_module(module)

        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(
                os.path.join(self._dir(name), 'model.pickle'))
        except Exception:
            # No trained pickle yet; fall back to a fresh, untrained model
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model

        return model

    def learn(self, name, from_data, to_predict, kwargs={}):
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        to_predict = to_predict if isinstance(to_predict,
                                              list) else [to_predict]
        data_source = getattr(mindsdb_native,
                              from_data['class'])(*from_data['args'],
                                                  **from_data['kwargs'])
        data_frame = data_source.df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(
            data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        self.save_model_data(name, model_data)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)], setup=False)

    def predict(self, name, when_data=None, from_data=None, kwargs={}):
        if from_data is not None:
            data_source = getattr(mindsdb_native,
                                  from_data['class'])(*from_data['args'],
                                                      **from_data['kwargs'])
            data_frame = data_source.df
        elif when_data is not None:
            if isinstance(when_data, dict):
                # Wrap scalar values so a single row becomes a one-row DataFrame
                when_data = {k: [v] for k, v in when_data.items()}
            data_frame = pd.DataFrame(when_data)
        else:
            raise ValueError('Either `when_data` or `from_data` must be provided')

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            row = {}
            for col in predictions.columns:
                row[col] = {'predicted_value': predictions[col].iloc[i]}
            pred_arr.append(row)

        return pred_arr

    def get_model_data(self, name):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'r') as fp:
            return json.load(fp)

    def save_model_data(self, name, data):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'w') as fp:
            json.dump(data, fp)

    def get_models(self, status='any'):
        models = []
        for model_dir in os.listdir(self.storage_dir):
            if 'custom_model_' in model_dir:
                name = model_dir.replace('custom_model_', '')
                try:
                    models.append(self.get_model_data(name))
                except Exception:
                    print(f'Model {name} not found!')

        return models

    def delete_model(self, name):
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))
        self.dbw.register_predictors([self.get_model_data(new_name)],
                                     setup=False)

    def export_model(self, name):
        shutil.make_archive(base_name=name,
                            format='zip',
                            root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        model.to_predict = model.to_predict if isinstance(
            model.to_predict, list) else [model.to_predict]
        self.save_model_data(
            name, {
                'name': name,
                'data_analysis': model.column_type_map,
                'predict': model.to_predict,
                'status': trained_status,
                'is_custom': True
            })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)],
                                         setup=False)
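
The _internal_load, learn, and predict methods above imply a specific interface for the user-supplied model module: a Model class with load/save, initialize_column_types, an optional setup, fit, predict, and to_predict/column_type_map attributes. A minimal skeleton consistent with those calls might look like the sketch below; it is inferred from the code above, not an official template.

# Minimal custom-model skeleton inferred from CustomModels above (a sketch, not
# an official template). It would live in <storage>/custom_model_<name>/<name>.py.
import pickle

import pandas as pd


class Model:
    def __init__(self):
        self.to_predict = []
        self.column_type_map = {}

    def initialize_column_types(self):
        # Called by _internal_load when no trained pickle exists yet.
        self.column_type_map = {}

    def fit(self, data_frame, to_predict, data_analysis, kwargs):
        # Train on data_frame; data_analysis is mindsdb_native's analysis dict.
        self.to_predict = to_predict
        self.column_type_map = data_analysis

    def predict(self, data_frame, kwargs):
        # Must return a DataFrame with one column per predicted target.
        return pd.DataFrame({col: [None] * len(data_frame)
                             for col in self.to_predict})

    def save(self, path):
        with open(path, 'wb') as fp:
            pickle.dump(self, fp)

    @staticmethod
    def load(path):
        with open(path, 'rb') as fp:
            return pickle.load(fp)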
Example #3
class DataStore():
    def __init__(self, config):
        self.config = config
        self.dir = config.paths['datasources']
        self.mindsdb_native = MindsdbNative(config)

    def get_analysis(self, ds):
        if isinstance(ds, str):
            return self.mindsdb_native.analyse_dataset(
                self.get_datasource_obj(ds))
        else:
            return self.mindsdb_native.analyse_dataset(ds)

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(os.path.join(self.dir, ds_name, 'metadata.json'),
                          'r') as fp:
                    try:
                        datasource = json.load(fp)
                        datasource['created_at'] = parse_dt(
                            datasource['created_at'].split('.')[0])
                        datasource['updated_at'] = parse_dt(
                            datasource['updated_at'].split('.')[0])
                        datasource_arr.append(datasource)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        if offset is None:
            offset = 0

        ds = self.get_datasource_obj(name)

        # @TODO Remove and add `offset` to the `filter` method of the datasource
        if limit is not None:
            filtered_ds = ds.filter(where=where, limit=limit + offset)
        else:
            filtered_ds = ds.filter(where=where)

        filtered_ds = filtered_ds.iloc[offset:]

        filtered_ds = filtered_ds.where(pd.notnull(filtered_ds), None)

        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        shutil.rmtree(os.path.join(self.dir, name))

    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)

                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(
                        f"Unknown DS type: {source_type}, type is {integration['type']}"
                    )

                if integration['type'] in ['clickhouse']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] in [
                        'mssql', 'postgres', 'mariadb', 'mysql'
                ]:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    if 'database' in integration:
                        picklable['kwargs']['database'] = integration[
                            'database']

                    if 'database' in source:
                        picklable['kwargs']['database'] = source['database']

                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] == 'snowflake':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }

                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] == 'mongodb':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    ds = dsClass(**picklable['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception(
                    'Each column in the datasource must have a unique name')

            # Not sure if needed
            #summary_analysis = self.get_analysis(ds.filter(limit=200))['data_analysis_v2']

            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'wb') as fp:
                pickle.dump(picklable, fp)

            with open(os.path.join(ds_meta_dir, 'metadata.json'), 'w') as fp:
                meta = {
                    'name': name,
                    'source_type': source_type,
                    'source': source,
                    'created_at': str(datetime.datetime.now()).split('.')[0],
                    'updated_at': str(datetime.datetime.now()).split('.')[0],
                    'row_count': len(df),
                    'columns': [dict(name=x) for x in list(df.keys())]
                }
                json.dump(meta, fp, indent=4, sort_keys=True)

            with open(os.path.join(ds_meta_dir, 'versions.json'), 'wt') as fp:
                json.dump(self.config.versions, fp, indent=4, sort_keys=True)

        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds = None
        try:
            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if raw:
                    return picklable
                try:
                    ds = eval(picklable['class'])(*picklable['args'],
                                                  **picklable['kwargs'])
                except Exception:
                    ds = picklable
            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
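
get_datasource_obj above rebuilds the datasource by eval()-ing the class name stored in ds.pickle. An equivalent sketch that avoids eval by reusing the explicit class-map idea already present in save_datasource could look like this; it assumes the DS classes are imported as in the original code.

# Sketch: rebuild a datasource from ds.pickle via an explicit class map instead
# of eval(). Assumes FileDS, ClickhouseDS, etc. are imported as in the code above.
import os
import pickle

DS_CLASSES = {
    'FileDS': FileDS,
    'ClickhouseDS': ClickhouseDS,
    'MariaDS': MariaDS,
    'MySqlDS': MySqlDS,
    'PostgresDS': PostgresDS,
    'MSSQLDS': MSSQLDS,
    'MongoDS': MongoDS,
    'SnowflakeDS': SnowflakeDS
}


def load_datasource_obj(ds_meta_dir, raw=False):
    with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'rb') as fp:
        picklable = pickle.load(fp)
    if raw:
        return picklable
    ds_class = DS_CLASSES[picklable['class']]
    return ds_class(*picklable['args'], **picklable['kwargs'])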
Example #4
class DataStore():
    def __init__(self, config, storage_dir=None):
        self.config = config
        self.dir = storage_dir if isinstance(
            storage_dir,
            str) else config['interface']['datastore']['storage_dir']
        self.mindsdb_native = MindsdbNative(config)

    def get_analysis(self, ds):
        try:
            return self.mindsdb_native.analyse_dataset(ds)
        except Exception:
            return self.mindsdb_native.analyse_dataset(
                self.get_datasource_obj(ds))

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(
                        os.path.join(self.dir, ds_name, 'datasource',
                                     'metadata.json'), 'r') as fp:
                    try:
                        datasource = json.load(fp)
                        datasource['created_at'] = parse_dt(
                            datasource['created_at'].split('.')[0])
                        datasource['updated_at'] = parse_dt(
                            datasource['updated_at'].split('.')[0])
                        datasource_arr.append(datasource)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        # @TODO Apply filter directly to postgres/mysql/clickhouse/etc...  when the datasource is of that type
        return get_sqlite_data(os.path.join(self.dir, name, 'datasource',
                                            'sqlite.db'),
                               where=where,
                               limit=limit,
                               offset=offset)

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        datasource = self.get_datasource(name)
        shutil.rmtree(os.path.join(self.dir, datasource['name']))

    def save_datasource(self, name, source_type, source, file_path=None):
        print(name, source_type, source)
        if source_type == 'file' and (file_path is None):
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        os.mkdir(ds_dir)

        print(source_type)
        if source_type == 'file':
            source = os.path.join(ds_dir, source)
            os.replace(file_path, source)
            ds = FileDS(source)
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}
        elif source_type == 'clickhouse':
            user = self.config['integrations']['default_clickhouse']['user']
            password = self.config['integrations']['default_clickhouse'][
                'password']
            # TODO add host port params
            ds = ClickhouseDS(source, user=user, password=password)
            picklable = {
                'class': 'ClickhouseDS',
                'args': [source],
                'kwargs': {
                    'user': user,
                    'password': password
                }
            }
        elif source_type == 'mariadb':
            user = self.config['integrations']['default_mariadb']['user']
            password = self.config['integrations']['default_mariadb'][
                'password']
            host = self.config['integrations']['default_mariadb']['host']
            port = self.config['integrations']['default_mariadb']['port']
            ds = MariaDS(source,
                         user=user,
                         password=password,
                         host=host,
                         port=port)
            picklable = {
                'class': 'MariaDS',
                'args': [source],
                'kwargs': {
                    'user': user,
                    'password': password,
                    'host': host,
                    'port': port
                }
            }
        else:
            # This probably only happens for urls
            print('Create URL data source !')
            ds = FileDS(source)
            picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

        df = ds.df

        df_with_types = cast_df_columns_types(
            df,
            self.get_analysis(df)['data_analysis_v2'])
        create_sqlite_db(os.path.join(ds_dir, 'sqlite.db'), df_with_types)

        print(picklable)
        with open(os.path.join(ds_dir, 'ds.pickle'), 'wb') as fp:
            pickle.dump(picklable, fp)

        with open(os.path.join(ds_dir, 'metadata.json'), 'w') as fp:
            json.dump(
                {
                    'name': name,
                    'source_type': source_type,
                    'source': source,
                    'created_at': str(datetime.datetime.now()).split('.')[0],
                    'updated_at': str(datetime.datetime.now()).split('.')[0],
                    'row_count': len(df),
                    'columns': [dict(name=x) for x in list(df.keys())]
                }, fp)

        return self.get_datasource_obj(name, avoid_crash=True)

    def get_datasource_obj(self, name, avoid_crash=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds_dir = os.path.join(ds_meta_dir, 'datasource')
        ds = None
        try:
            #resource.setrlimit(resource.RLIMIT_STACK, [0x10000000, resource.RLIM_INFINITY])
            #sys.setrecursionlimit(0x100000)
            with open(os.path.join(ds_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if avoid_crash:
                    return picklable
                try:
                    ds = eval(picklable['class'])(*picklable['args'],
                                                  **picklable['kwargs'])
                except Exception:
                    ds = picklable

            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
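
Each DataStore example above uses the same name de-duplication loop in save_datasource. Its behavior is easiest to see in isolation; the sketch below reproduces the loop in plain Python, where `existing` stands in for the names returned by get_datasources().

# Self-contained sketch of the __{i}__ de-duplication loop from save_datasource.
def dedupe_name(name, existing):
    for i in range(1, 1000):
        if name in existing:
            # Strip the previous suffix (a no-op on the first pass), then append the next.
            name = name.replace(f'__{i - 1}__', '')
            name = f'{name}__{i}__'
        else:
            break
    return name


print(dedupe_name('sales', {'sales'}))                 # -> sales__1__
print(dedupe_name('sales', {'sales', 'sales__1__'}))   # -> sales__2__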