Example #1
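
# DataNode and DataStore are imported from the surrounding MindsDB package.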
class DataSourceDataNode(DataNode):
    type = 'mindsdb-datasource'

    def __init__(self, config):
        self.config = config
        self.datastore = DataStore(config)
        # self.mindsdb_native = MindsdbNative(config)

    def getTables(self):
        dss = self.datastore.get_datasources()
        return [x['name'] for x in dss]

    def hasTable(self, table):
        return table in self.getTables()

    def getTableColumns(self, table):
        ds = self.datastore.get_datasource(table)
        return [x['name'] for x in ds['columns']]

    def select(self,
               table,
               columns=None,
               where=None,
               where_data=None,
               order_by=None,
               group_by=None,
               came_from=None):
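        # The column/filter arguments are accepted for interface
        # compatibility but are not forwarded; every row is returned.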
        data = self.datastore.get_data(table,
                                       where=None,
                                       limit=None,
                                       offset=None)
        return data['data']
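
A minimal usage sketch (hypothetical: `config` stands in for a loaded
MindsDB Config object, and 'my_datasource' for an existing datasource):

node = DataSourceDataNode(config)
if node.hasTable('my_datasource'):
    print(node.getTableColumns('my_datasource'))
    rows = node.select('my_datasource')  # returns every row of the datasource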

Example #2
import os
from time import time
from threading import Event, Thread

import pandas as pd

# DataStore, ModelInterface, ModelInterfaceWrapper, Config, Cache and the
# db module are imported from the surrounding MindsDB package.

class StreamController:
    def __init__(self,
                 name,
                 predictor,
                 stream_in,
                 stream_out,
                 anomaly_stream=None,
                 learning_stream=None,
                 learning_threshold=100):
        self.name = name
        self.predictor = predictor

        self.stream_in = stream_in
        self.stream_out = stream_out
        self.anomaly_stream = anomaly_stream

        self.learning_stream = learning_stream
        self.learning_threshold = learning_threshold
        self.learning_data = []

        self.company_id = os.environ.get('MINDSDB_COMPANY_ID', None)
        self.stop_event = Event()
        self.model_interface = ModelInterfaceWrapper(ModelInterface())
        self.data_store = DataStore()
        self.config = Config()

        p = db.session.query(db.Predictor).filter_by(
            company_id=self.company_id, name=self.predictor).first()
        if p is None:
            raise Exception(f'Predictor {predictor} doesn\'t exist')

        self.target = p.to_predict[0]

        ts_settings = p.learn_args.get('timeseries_settings', None)
        # guard against predictors trained without timeseries settings
        if ts_settings is not None and not ts_settings.get('is_timeseries'):
            ts_settings = None

        if ts_settings is None:
            self.thread = Thread(target=StreamController._make_predictions,
                                 args=(self, ))
        else:
            self.ts_settings = ts_settings
            self.thread = Thread(target=StreamController._make_ts_predictions,
                                 args=(self, ))

        self.thread.start()

    def _is_anomaly(self, res):
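        # Prediction rows may carry '<column>_anomaly' keys; any non-null
        # value marks the row as anomalous.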
        for k in res:
            if k.endswith('_anomaly') and res[k] is not None:
                return True
        return False

    def _consider_learning(self):
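        # Buffer rows from the learning stream; once learning_threshold
        # rows have accumulated, persist them as a file datasource and
        # adjust (fine-tune) the predictor on that data.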
        if self.learning_stream is not None:
            self.learning_data.extend(self.learning_stream.read())
            if len(self.learning_data) >= self.learning_threshold:
                p = db.session.query(db.Predictor).filter_by(
                    company_id=self.company_id, name=self.predictor).first()
                ds_record = db.session.query(
                    db.Datasource).filter_by(id=p.datasource_id).first()

                df = pd.DataFrame.from_records(self.learning_data)
                name = 'name_' + str(time()).replace('.', '_')
                path = os.path.join(self.config['paths']['datasources'], name)
                # index=False keeps the DataFrame index out of the datasource
                df.to_csv(path, index=False)

                from_data = {
                    'class': 'FileDS',
                    'args': [path],
                    'kwargs': {},
                }

                self.data_store.save_datasource(name=name,
                                                source_type='file',
                                                source=path,
                                                file_path=path,
                                                company_id=self.company_id)
                ds = self.data_store.get_datasource(name, self.company_id)

                self.model_interface.adjust(p.name, from_data, ds['id'],
                                            self.company_id)
                self.learning_data.clear()

    def _make_predictions(self):
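        # Non-timeseries mode: poll the input stream every 0.5s and route
        # each prediction to the output stream (or the anomaly stream).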
        while not self.stop_event.wait(0.5):
            self._consider_learning()
            for when_data in self.stream_in.read():
                preds = self.model_interface.predict(self.predictor, when_data,
                                                     'dict')
                for res in preds:
                    if self.anomaly_stream is not None and self._is_anomaly(
                            res):
                        self.anomaly_stream.write(res)
                    else:
                        self.stream_out.write(res)

    def _make_ts_predictions(self):
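        # Timeseries mode: buffer incoming rows per group-by value until a
        # full window accumulates, then predict over a sliding window.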
        window = self.ts_settings['window']

        order_by = self.ts_settings['order_by']
        order_by = [order_by] if isinstance(order_by, str) else order_by

        group_by = self.ts_settings.get('group_by', None)
        group_by = [group_by] if isinstance(group_by, str) else group_by

        cache = Cache(self.name)

        while not self.stop_event.wait(0.5):
            self._consider_learning()
            for when_data in self.stream_in.read():
                for ob in order_by:
                    if ob not in when_data:
                        raise Exception(
                            f'when_data doesn\'t contain order_by[{ob}]')

                # group_by is optional; only validate it when present
                if group_by is not None:
                    for gb in group_by:
                        if gb not in when_data:
                            raise Exception(
                                f'when_data doesn\'t contain group_by[{gb}]')

                gb_value = tuple(
                    when_data[gb]
                    for gb in group_by) if group_by is not None else ''

                # the cache can't use tuples as keys (it raises
                # "tuple doesn't have 'encode' attribute"), so stringify
                gb_value = str(gb_value)

                with cache:
                    if gb_value not in cache:
                        cache[gb_value] = []

                    # the shelve-backed cache doesn't support in-place
                    # mutation, so read, modify and write back
                    records = cache[gb_value]
                    records.append(when_data)
                    cache[gb_value] = records

            with cache:
                for gb_value in cache.keys():
                    if len(cache[gb_value]) >= window:
                        cache[gb_value] = [
                            *sorted(
                                cache[gb_value],
                                # WARNING: assuming wd[ob] is numeric
                                key=lambda wd: tuple(wd[ob]
                                                     for ob in order_by))
                        ]

                        # emit one prediction per step of the sliding
                        # window, then drop the oldest record
                        while len(cache[gb_value]) >= window:
                            res_list = self.model_interface.predict(
                                self.predictor, cache[gb_value][:window],
                                'dict')
                            if self.anomaly_stream is not None and self._is_anomaly(
                                    res_list[-1]):
                                self.anomaly_stream.write(res_list[-1])
                            else:
                                self.stream_out.write(res_list[-1])
                            cache[gb_value] = cache[gb_value][1:]
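
A minimal usage sketch, assuming the predictor 'home_rentals' has already
been trained and that `in_stream` / `out_stream` are MindsDB stream objects
exposing read()/write() (the names here are hypothetical):

controller = StreamController(name='rentals_stream',
                              predictor='home_rentals',
                              stream_in=in_stream,
                              stream_out=out_stream)
# ... later, shut down the background prediction thread:
controller.stop_event.set()
controller.thread.join()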