Example 1
def initialize_interfaces(config, app):
    app.default_store = DataStore(config)
    app.mindsdb_native = MindsdbNative(config)
Example 2
class MindsDBDataNode(DataNode):
    type = 'mindsdb'

    def __init__(self, config):
        self.config = config
        self.mindsdb_native = MindsdbNative(config)

    def getTables(self):
        models = self.mindsdb_native.get_models()
        models = [x['name'] for x in models if x['status'] == 'complete']
        models += ['predictors', 'commands']
        return models

    def hasTable(self, table):
        return table in self.getTables()

    def getTableColumns(self, table):
        if table == 'predictors':
            return [
                'name', 'status', 'accuracy', 'predict', 'select_data_query',
                'external_datasource', 'training_options'
            ]
        if table == 'commands':
            return ['command']
        model = self.mindsdb_native.get_model_data(name=table)
        columns = []
        columns += [
            x['column_name']
            for x in model['data_analysis']['input_columns_metadata']
        ]
        columns += [
            x['column_name']
            for x in model['data_analysis']['target_columns_metadata']
        ]
        columns += [f'{x}_original' for x in model['predict']]
        for col in model['predict']:
            if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                columns += [f"{col}_min", f"{col}_max"]
            columns += [f"{col}_confidence", f"{col}_explain"]

        # TODO this should be added just for clickhouse queries
        columns += ['when_data', 'select_data_query', 'external_datasource']
        return columns

    def _select_predictors(self):
        models = self.mindsdb_native.get_models()
        return [
            {
                'name': x['name'],
                'status': x['status'],
                'accuracy': str(x['accuracy']) if x['accuracy'] is not None else None,
                'predict': ', '.join(x['predict']),
                'select_data_query': x['data_source'],
                'external_datasource': '',  # TODO
                'training_options': ''  # TODO ?
            } for x in models
        ]

    def delete_predictor(self, name):
        self.mindsdb_native.delete_model(name)

    def select(self,
               table,
               columns=None,
               where=None,
               where_data=None,
               order_by=None,
               group_by=None,
               came_from=None):
        '''NOTE: WHERE conditions may only be $eq comparisons joined with AND'''
        if table == 'predictors':
            return self._select_predictors()
        if table == 'commands':
            return []

        original_when_data = None
        if 'when_data' in where:
            if len(where) > 1:
                raise ValueError(
                    "No other keys may be used in 'where' when 'when_data' is used"
                )
            try:
                original_when_data = where['when_data']['$eq']
                where_data = json.loads(where['when_data']['$eq'])
                if not isinstance(where_data, list):
                    where_data = [where_data]
            except Exception:
                raise ValueError(
                    f'''Error while parsing 'when_data'="{where['when_data']}"''')

        external_datasource = None
        if 'external_datasource' in where:
            external_datasource = where['external_datasource']['$eq']
            del where['external_datasource']

        select_data_query = None
        if came_from is not None and 'select_data_query' in where:
            select_data_query = where['select_data_query']['$eq']
            del where['select_data_query']

            dbtype = self.config['integrations'][came_from]['type']
            if dbtype == 'clickhouse':
                ch = Clickhouse(self.config, came_from)
                res = ch._query(
                    select_data_query.strip(' ;\n') + ' FORMAT JSON')
                data = res.json()['data']
            elif dbtype == 'mariadb':
                maria = Mariadb(self.config, came_from)
                data = maria._query(select_data_query)
            elif dbtype == 'mysql':
                mysql = MySQL(self.config, came_from)
                data = mysql._query(select_data_query)
            elif dbtype == 'postgres':
                postgres = PostgreSQL(self.config, came_from)
                data = postgres._query(select_data_query)
            else:
                raise Exception(f'Unknown database type: {dbtype}')

            if where_data is None:
                where_data = data
            else:
                where_data += data

        new_where = {}
        if where_data is not None:
            where_data = pandas.DataFrame(where_data)
        else:
            for key, value in where.items():
                if not isinstance(value, dict) or list(value.keys()) != ['$eq']:
                    # TODO value should be just a string or number
                    raise Exception(
                        f"Only {{'$eq': ...}} conditions are supported in 'where', got: {value}"
                    )
                new_where[key] = value['$eq']

            if len(new_where) == 0:
                return []

            where_data = [new_where]

        model = self.mindsdb_native.get_model_data(name=table)
        predicted_columns = model['predict']

        original_target_values = {}
        for col in predicted_columns:
            if where_data is not None:
                if col in where_data:
                    original_target_values[col + '_original'] = list(where_data[col])
                else:
                    original_target_values[col + '_original'] = [None] * len(where_data)
            else:
                original_target_values[col + '_original'] = [None]

        res = self.mindsdb_native.predict(name=table, when_data=where_data)

        data = []
        keys = [x for x in list(res._data.keys()) if x in columns]
        min_max_keys = []
        for col in predicted_columns:
            if model['data_analysis_v2'][col]['typing']['data_type'] == 'Numeric':
                min_max_keys.append(col)

        length = len(res._data[predicted_columns[0]])
        for i in range(length):
            row = {}
            explanation = res[i].explain()
            for key in keys:
                row[key] = res._data[key][i]
                # +++ FIXME: workaround until https://github.com/mindsdb/mindsdb/issues/591 is resolved
                typing = None
                if key in model['data_analysis_v2']:
                    typing = model['data_analysis_v2'][key]['typing']['data_subtype']

                if typing == 'Timestamp' and row[key] is not None:
                    timestamp = datetime.datetime.utcfromtimestamp(row[key])
                    row[key] = timestamp.strftime('%Y-%m-%d %H:%M:%S')
                elif typing == 'Date' and row[key] is not None:
                    timestamp = datetime.datetime.utcfromtimestamp(row[key])
                    row[key] = timestamp.strftime('%Y-%m-%d')
                # ---
            for key in predicted_columns:
                row[key + '_confidence'] = explanation[key]['confidence']
                row[key + '_explain'] = json.dumps(explanation[key])
            for key in min_max_keys:
                row[key + '_min'] = min(explanation[key]['confidence_interval'])
                row[key + '_max'] = max(explanation[key]['confidence_interval'])
            row['select_data_query'] = select_data_query
            row['external_datasource'] = external_datasource
            row['when_data'] = original_when_data
            for k in original_target_values:
                row[k] = original_target_values[k][i]
            data.append(row)

        return data
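
A minimal usage sketch for the data node above. It is hypothetical: the table name 'home_rentals', the filter column, and the shape of config are placeholders, not taken from the snippet.

# Hypothetical usage of MindsDBDataNode; names below are placeholders.
node = MindsDBDataNode(config)
print(node.getTables())  # trained model names plus 'predictors' and 'commands'
if node.hasTable('home_rentals'):
    rows = node.select(
        table='home_rentals',
        columns=node.getTableColumns('home_rentals'),
        where={'sqft': {'$eq': 1000}},  # only $eq filters joined with AND are supported
    )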
Example 3
def __init__(self, config):
    self.config = config
    self.mindsdb_native = MindsdbNative(config)
Example 4
def initialize_interfaces(config, app):
    app.default_store = DataStore(config)
    app.mindsdb_native = MindsdbNative(config)
    app.custom_models = CustomModels(config)
    app.config_obj = config
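
A sketch of how this initializer might be wired up. The Flask app and the Config loader are assumptions for illustration, not shown in the snippet.

# Hypothetical wiring; `Config` and the path are placeholders.
from flask import Flask

config = Config('/path/to/config.json')  # assumed config loader
app = Flask(__name__)
initialize_interfaces(config, app)
print(app.default_store.get_datasources())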
Example 5
    } for api in api_arr]

    for api in api_arr:
        api_name = api['name']
        if api_name not in config['api']:
            print(f"Trying run '{api_name}' API, but is no config for this api.")
            print(f"Please, fill config['api']['{api_name}']")
            sys.exit(0)

    start_functions = {
        'http': start_http,
        'mysql': start_mysql,
        'mongodb': start_mongo
    }

    mdb = MindsdbNative(config)
    cst = CustomModels(config)
    # @TODO Maybe just use `get_model_data` directly here? Seems like a useless abstraction.
    model_data_arr = [
        {
            'name': x['name'],
            'predict': x['predict'],
            'data_analysis': mdb.get_model_data(x['name'])['data_analysis_v2']
        } for x in mdb.get_models()
    ]

    for m in model_data_arr:
        if 'columns_to_ignore' in m['data_analysis']:
            del m['data_analysis']['columns_to_ignore']
        if 'train_std_dev' in m['data_analysis']:
            del m['data_analysis']['train_std_dev']
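
Once validated, each configured API could be launched through the dispatch table above. A minimal sketch, assuming each start function takes the config; the multiprocessing launch is an assumption, not shown in the snippet.

# Hypothetical launch loop; the start functions' signatures are assumed.
from multiprocessing import Process

for api in api_arr:
    proc = Process(target=start_functions[api['name']], args=(config,))
    proc.start()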
Example 6
def __init__(self, config):
    self.config = config
    self.dir = config.paths['datasources']
    self.mindsdb_native = MindsdbNative(config)
Example 7
class DataStore():
    def __init__(self, config):
        self.config = config
        self.dir = config.paths['datasources']
        self.mindsdb_native = MindsdbNative(config)

    def get_analysis(self, ds):
        if isinstance(ds, str):
            return self.mindsdb_native.analyse_dataset(
                self.get_datasource_obj(ds))
        else:
            return self.mindsdb_native.analyse_dataset(ds)

    def get_datasources(self):
        datasource_arr = []
        for ds_name in os.listdir(self.dir):
            try:
                with open(os.path.join(self.dir, ds_name, 'metadata.json'), 'r') as fp:
                    datasource = json.load(fp)
                    datasource['created_at'] = parse_dt(
                        datasource['created_at'].split('.')[0])
                    datasource['updated_at'] = parse_dt(
                        datasource['updated_at'].split('.')[0])
                    datasource_arr.append(datasource)
            except Exception as e:
                print(e)
        return datasource_arr

    def get_data(self, name, where=None, limit=None, offset=None):
        if offset is None:
            offset = 0

        ds = self.get_datasource_obj(name)

        # @TODO Remove and add `offset` to the `filter` method of the datasource
        if limit is not None:
            filtered_ds = ds.filter(where=where, limit=limit + offset)
        else:
            filtered_ds = ds.filter(where=where)

        filtered_ds = filtered_ds.iloc[offset:]

        data = filtered_ds.to_dict(orient='records')
        return {
            'data': data,
            'rowcount': len(ds),
            'columns_names': filtered_ds.columns
        }

    def get_datasource(self, name):
        for ds in self.get_datasources():
            if ds['name'] == name:
                return ds
        return None

    def delete_datasource(self, name):
        shutil.rmtree(os.path.join(self.dir, name))

    def save_datasource(self, name, source_type, source, file_path=None):
        if source_type == 'file' and file_path is None:
            raise Exception(
                '`file_path` argument required when source_type == "file"')

        # If the name is already taken, strip any previous __N__ suffix and
        # advance it until an unused name is found.
        for i in range(1, 1000):
            if name in [x['name'] for x in self.get_datasources()]:
                previous_index = i - 1
                name = name.replace(f'__{previous_index}__', '')
                name = f'{name}__{i}__'
            else:
                break

        ds_meta_dir = os.path.join(self.dir, name)
        os.mkdir(ds_meta_dir)

        try:
            if source_type == 'file':
                source = os.path.join(ds_meta_dir, source)
                shutil.move(file_path, source)
                ds = FileDS(source)

                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            elif source_type in self.config['integrations']:
                integration = self.config['integrations'][source_type]

                ds_class_map = {
                    'clickhouse': ClickhouseDS,
                    'mariadb': MariaDS,
                    'mysql': MySqlDS,
                    'postgres': PostgresDS,
                    'mssql': MSSQLDS,
                    'mongodb': MongoDS,
                    'snowflake': SnowflakeDS
                }

                try:
                    dsClass = ds_class_map[integration['type']]
                except KeyError:
                    raise KeyError(
                        f"Unknown datasource type '{integration['type']}' for integration '{source_type}'"
                    )

                if integration['type'] in ['clickhouse']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }
                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] in ['mssql', 'postgres', 'mariadb', 'mysql']:
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    if 'database' in integration:
                        picklable['kwargs']['database'] = integration['database']

                    if 'database' in source:
                        picklable['kwargs']['database'] = source['database']

                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] == 'snowflake':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'query': source['query'],
                            'schema': source['schema'],
                            'warehouse': source['warehouse'],
                            'database': source['database'],
                            'host': integration['host'],
                            'password': integration['password'],
                            'user': integration['user'],
                            'account': integration['account']
                        }
                    }

                    ds = dsClass(**picklable['kwargs'])

                elif integration['type'] == 'mongodb':
                    picklable = {
                        'class': dsClass.__name__,
                        'args': [],
                        'kwargs': {
                            'database': source['database'],
                            'collection': source['collection'],
                            'query': source['find'],
                            'user': integration['user'],
                            'password': integration['password'],
                            'host': integration['host'],
                            'port': integration['port']
                        }
                    }

                    ds = dsClass(**picklable['kwargs'])
            else:
                # This probably only happens for urls
                ds = FileDS(source)
                picklable = {'class': 'FileDS', 'args': [source], 'kwargs': {}}

            df = ds.df

            if '' in df.columns or len(df.columns) != len(set(df.columns)):
                shutil.rmtree(ds_meta_dir)
                raise Exception(
                    'Each column in a datasource must have a non-empty, unique name')

            # Not sure if needed
            #summary_analysis = self.get_analysis(ds.filter(limit=200))['data_analysis_v2']

            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'wb') as fp:
                pickle.dump(picklable, fp)

            with open(os.path.join(ds_meta_dir, 'metadata.json'), 'w') as fp:
                meta = {
                    'name': name,
                    'source_type': source_type,
                    'source': source,
                    'created_at': str(datetime.datetime.now()).split('.')[0],
                    'updated_at': str(datetime.datetime.now()).split('.')[0],
                    'row_count': len(df),
                    'columns': [dict(name=x) for x in list(df.keys())]
                }
                json.dump(meta, fp, indent=4, sort_keys=True)

            with open(os.path.join(ds_meta_dir, 'versions.json'), 'wt') as fp:
                json.dump(self.config.versions, fp, indent=4, sort_keys=True)

        except Exception:
            if os.path.isdir(ds_meta_dir):
                shutil.rmtree(ds_meta_dir)
            raise

        return self.get_datasource_obj(name, raw=True), name

    def get_datasource_obj(self, name, raw=False):
        ds_meta_dir = os.path.join(self.dir, name)
        ds = None
        try:
            with open(os.path.join(ds_meta_dir, 'ds.pickle'), 'rb') as fp:
                picklable = pickle.load(fp)
                if raw:
                    return picklable
                try:
                    ds_class = eval(picklable['class'])
                    ds = ds_class(*picklable['args'], **picklable['kwargs'])
                except Exception:
                    ds = picklable
            return ds
        except Exception as e:
            print(f'\n{e}\n')
            return None
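
A usage sketch for DataStore. The datasource name and file paths are placeholders, not taken from the snippet.

# Hypothetical usage; 'my_data' and the CSV paths are placeholders.
store = DataStore(config)
ds_obj, final_name = store.save_datasource(
    name='my_data',
    source_type='file',
    source='data.csv',
    file_path='/tmp/uploads/data.csv',
)
page = store.get_data(final_name, limit=10)
print(page['rowcount'], list(page['columns_names']))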
Example 8
class CustomModels():
    def __init__(self, config):
        self.config = config
        self.dbw = DatabaseWrapper(self.config)
        self.storage_dir = os.path.join(config['storage_dir'], 'misc')
        os.makedirs(self.storage_dir, exist_ok=True)
        self.model_cache = {}
        self.mindsdb_native = MindsdbNative(self.config)

    def _dir(self, name):
        return str(os.path.join(self.storage_dir, 'custom_model_' + name))

    def _internal_load(self, name):
        # Caching (2 lines below), currently disabled due to multiprocessing cache invalidation issues
        #if name in self.model_cache:
        #    return self.model_cache[name]

        # "Proper" model loading (3 lines bellow), currently disabled due to pickling issues
        #spec = importlib.util.spec_from_file_location(name, self._dir(name) + '/model.py')
        #module = importlib.util.module_from_spec(spec)
        #spec.loader.exec_module(module)

        sys.path.insert(0, self._dir(name))
        module = __import__(name)

        try:
            model = module.Model.load(
                os.path.join(self._dir(name), 'model.pickle'))
        except Exception:
            model = module.Model()
            model.initialize_column_types()
            if hasattr(model, 'setup'):
                model.setup()

        self.model_cache[name] = model

        return model

    def learn(self, name, from_data, to_predict, kwargs=None):
        # Avoid a mutable default argument.
        if kwargs is None:
            kwargs = {}
        model_data = self.get_model_data(name)
        model_data['status'] = 'training'
        self.save_model_data(name, model_data)

        if not isinstance(to_predict, list):
            to_predict = [to_predict]
        ds_class = getattr(mindsdb_native, from_data['class'])
        data_source = ds_class(*from_data['args'], **from_data['kwargs'])
        data_frame = data_source._df
        model = self._internal_load(name)
        model.to_predict = to_predict

        model_data = self.get_model_data(name)
        model_data['predict'] = model.to_predict
        self.save_model_data(name, model_data)

        data_analysis = self.mindsdb_native.analyse_dataset(
            data_source)['data_analysis_v2']

        model_data = self.get_model_data(name)
        model_data['data_analysis'] = data_analysis
        self.save_model_data(name, model_data)

        model.fit(data_frame, to_predict, data_analysis, kwargs)

        model.save(os.path.join(self._dir(name), 'model.pickle'))
        self.model_cache[name] = model

        model_data = self.get_model_data(name)
        model_data['status'] = 'completed'
        self.save_model_data(name, model_data)

        self.dbw.unregister_predictor(name)
        self.dbw.register_predictors([self.get_model_data(name)], setup=False)

    def predict(self, name, when_data=None, from_data=None, kwargs=None):
        # Avoid a mutable default argument.
        if kwargs is None:
            kwargs = {}
        if from_data is not None:
            ds_class = getattr(mindsdb_native, from_data['class'])
            data_source = ds_class(*from_data['args'], **from_data['kwargs'])
            data_frame = data_source._df
        elif when_data is not None:
            if isinstance(when_data, dict):
                # Wrap scalar values so the dict becomes a single-row frame
                when_data = {k: [v] for k, v in when_data.items()}
            data_frame = pd.DataFrame(when_data)
        else:
            raise ValueError("Either 'when_data' or 'from_data' must be provided")

        model = self._internal_load(name)
        predictions = model.predict(data_frame, kwargs)

        pred_arr = []
        for i in range(len(predictions)):
            row = {}
            for col in predictions.columns:
                row[col] = {'predicted_value': predictions[col].iloc[i]}
            pred_arr.append(row)

        return pred_arr

    def get_model_data(self, name):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'r') as fp:
            return json.load(fp)

    def save_model_data(self, name, data):
        with open(os.path.join(self._dir(name), 'metadata.json'), 'w') as fp:
            json.dump(data, fp)

    def get_models(self, status='any'):
        models = []
        for model_dir in os.listdir(self.storage_dir):
            if 'custom_model_' in model_dir:
                name = model_dir.replace('custom_model_', '')
                try:
                    models.append(self.get_model_data(name))
                except Exception:
                    print(f'Model {name} not found!')

        return models

    def delete_model(self, name):
        shutil.rmtree(self._dir(name))
        self.dbw.unregister_predictor(name)

    def rename_model(self, name, new_name):
        self.dbw.unregister_predictor(name)
        shutil.move(self._dir(name), self._dir(new_name))
        shutil.move(os.path.join(self._dir(new_name), f'{name}.py'),
                    os.path.join(self._dir(new_name), f'{new_name}.py'))
        self.dbw.register_predictors([self.get_model_data(new_name)],
                                     setup=False)

    def export_model(self, name):
        shutil.make_archive(base_name=name,
                            format='zip',
                            root_dir=self._dir(name))
        return str(self._dir(name)) + '.zip'

    def load_model(self, fpath, name, trained_status):
        shutil.unpack_archive(fpath, self._dir(name), 'zip')
        shutil.move(os.path.join(self._dir(name), 'model.py'),
                    os.path.join(self._dir(name), f'{name}.py'))
        model = self._internal_load(name)
        if not isinstance(model.to_predict, list):
            model.to_predict = [model.to_predict]
        self.save_model_data(
            name, {
                'name': name,
                'data_analysis': model.column_type_map,
                'predict': model.to_predict,
                'status': trained_status,
                'is_custom': True
            })

        with open(os.path.join(self._dir(name), '__init__.py'), 'w') as fp:
            fp.write('')

        if trained_status == 'trained':
            self.dbw.register_predictors([self.get_model_data(name)],
                                         setup=False)
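
A sketch of the learn/predict round trip for CustomModels. It assumes the model code was already unpacked via load_model; the model name, the datasource spec, and the column names are placeholders.

# Hypothetical flow; 'my_model', the FileDS args, and 'rental_price' are placeholders.
cst = CustomModels(config)
cst.learn(
    name='my_model',
    from_data={'class': 'FileDS', 'args': ['data.csv'], 'kwargs': {}},
    to_predict='rental_price',
)
preds = cst.predict(name='my_model', when_data={'sqft': 1000})
print(preds[0]['rental_price']['predicted_value'])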