Example #1
    def shouldContinue(self):
        """
        Check if the training should continue
        :return:
        """

        # check if stop training is set, in which case we should exit the training
        model_data = self.persistent_model_metadata.find_one(
            {'model_name': self.model_name})  # type: PersistentModelMetadata

        if model_data is None:
            return False

        if model_data.stop_training:
            logging.info('[FORCED] Stopping model training....')
            return False

        elif model_data.kill_training:

            logging.info('[FORCED] Stopping model training....')
            self.persistent_model_metadata.delete()
            self.ml_model_info.delete()

            return False

        return True
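
For orientation, here is the same cooperative-stop pattern reduced to a hedged, self-contained sketch that polls a plain in-memory flag between epochs instead of the persisted metadata record; all names below are illustrative, not MindsDB's API.

import logging

class TrainingLoop:
    """Illustrative only: a training loop that polls a stop flag between epochs."""

    def __init__(self):
        self.stop_requested = False  # stands in for the persisted stop_training flag

    def shouldContinue(self):
        if self.stop_requested:
            logging.info('[FORCED] Stopping model training....')
            return False
        return True

    def train(self, epochs):
        for epoch in range(epochs):
            # ... one epoch of training would happen here ...
            if not self.shouldContinue():
                return epoch
        return epochs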
Example #2
    def saveToDisk(self, local_files):
        """
        This method persists model into disk, and removes previous stored files of this model

        :param local_files: any previous files
        :return:
        """
        if local_files is not None:
            for file_response_object in local_files:
                try:
                    os.remove(file_response_object.path)
                except OSError:
                    logging.info('Could not delete file {path}'.format(
                        path=file_response_object.path))

        file_id = '{model_name}.{ml_model_name}.{config_hash}'.format(
            model_name=self.model_name,
            ml_model_name=self.ml_model_name,
            config_hash=self.config_hash)
        return_objects = self.data_model_object.saveToDisk(file_id)

        file_ids = [ret.file_id for ret in return_objects]

        self.ml_model_info.fs_file_ids = file_ids
        self.ml_model_info.update()

        return return_objects
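
The save path keys artifacts by a deterministic identifier built from the model name and a hash of the serialized config. Below is a minimal sketch of that naming idea, assuming the standard library's hashlib in place of MindsDB's hashtext helper; the names and hash length are illustrative.

import hashlib
import json

def make_file_id(model_name, ml_model_name, config):
    """Build a deterministic artifact id from the model names and a config hash."""
    config_serialized = json.dumps(config, sort_keys=True)
    config_hash = hashlib.sha256(config_serialized.encode('utf-8')).hexdigest()[:12]
    return '{model_name}.{ml_model_name}.{config_hash}'.format(
        model_name=model_name,
        ml_model_name=ml_model_name,
        config_hash=config_hash)

# hypothetical usage:
# make_file_id('home_rentals', 'pytorch.models.column_based_fcnn', {'lr': 0.01})
# -> 'home_rentals.pytorch.models.column_based_fcnn.<12-char hash>'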
Example #3
    def __init__(self, model_name, data=None):
        """
        Load basic data needed to find the model data
        :param data: data to make predictions on
        :param model_name: the model to load
        :param submodel_name: if its also a submodel, the submodel name
        """

        self.model_name = model_name
        self.data = data
        self.persistent_model_metadata = PersistentModelMetadata()
        self.persistent_model_metadata.model_name = self.model_name
        self.persistent_ml_model_info = PersistentMlModelInfo()
        self.persistent_ml_model_info.model_name = self.model_name

        self.persistent_model_metadata = self.persistent_model_metadata.find_one(
            self.persistent_model_metadata.getPkey())

        # load the most accurate model

        info = self.persistent_ml_model_info.find(
            {'model_name': self.model_name},
            order_by=[('r_squared', -1)],
            limit=1)

        if info is not None and len(info) > 0:
            self.persistent_ml_model_info = info[0]  # type: PersistentMlModelInfo
        else:
            # TODO: Make sure we have a model for this
            logging.info('No model found')
            return

        self.ml_model_name = self.persistent_ml_model_info.ml_model_name
        self.config_serialized = self.persistent_ml_model_info.config_serialized

        fs_file_ids = self.persistent_ml_model_info.fs_file_ids
        self.framework, self.dummy, self.ml_model_name = self.ml_model_name.split(
            '.')
        self.ml_model_module_path = 'mindsdb.libs.ml_models.' + self.framework + '.models.' + self.ml_model_name + '.' + self.ml_model_name
        self.ml_model_class_name = convert_snake_to_cammelcase_string(
            self.ml_model_name)

        self.ml_model_module = importlib.import_module(
            self.ml_model_module_path)
        self.ml_model_class = getattr(self.ml_model_module,
                                      self.ml_model_class_name)

        # the last time the model was saved into GridFS; assume it was just now
        self.gfs_save_head_time = time.time()

        logging.info('Starting model...')
        self.data_model_object = self.ml_model_class.loadFromDisk(
            file_ids=fs_file_ids)

        if self.data is not None:
            self._loadData(data)
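
The constructor resolves the ML model class at runtime from a dotted module path via importlib. A minimal, self-contained sketch of that pattern; the module path and class name in the usage comment are hypothetical.

import importlib

def load_class(module_path, class_name):
    """Dynamically import `module_path` and return the attribute `class_name`."""
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

# hypothetical usage, mirroring the constructor above:
# ModelClass = load_class('mypackage.models.fully_connected_net', 'FullyConnectedNet')
# model = ModelClass.loadFromDisk(file_ids=['file_1', 'file_2'])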
Example #4
    def predict(self, data=None):
        """
        This calls the model and returns the predictions in diff form

        :return: diffs, a list of dictionaries with pointers indicating where to place each predicted value

        """

        if data is not None:
            self._loadData(data)

        self.predict_sampler.variable_wrapper = self.ml_model_class.variable_wrapper
        self.predict_sampler.variable_unwrapper = self.ml_model_class.variable_unwrapper

        ret_diffs = []
        for batch in self.predict_sampler:

            logging.info('predicting batch...')
            if self.data_model_object.use_full_text_input:
                ret = self.data_model_object.forward(
                    batch.getInput(flatten=self.data_model_object.flatInput),
                    full_text_input=batch.getFullTextInput())
            else:
                ret = self.data_model_object.forward(
                    batch.getInput(flatten=self.data_model_object.flatInput))
            if not isinstance(ret, dict):
                ret_dict = batch.deflatTarget(ret)
            else:
                ret_dict = ret

            ret_dict_denorm = {}

            for col in ret_dict:
                ret_dict[col] = self.ml_model_class.variable_unwrapper(
                    ret_dict[col])
                for row in ret_dict[col]:
                    if col not in ret_dict_denorm:
                        ret_dict_denorm[col] = []

                    ret_dict_denorm[col] += [
                        denorm(
                            row,
                            self.persistent_model_metadata.column_stats[col])
                    ]

            ret_total_item = {
                'group_pointer': batch.group_pointer,
                'column_pointer': batch.column_pointer,
                'start_pointer': batch.start_pointer,
                'end_pointer': batch.end_pointer,
                'ret_dict': ret_dict_denorm
            }
            ret_diffs += [ret_total_item]

        return ret_diffs
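
A hedged sketch of how the returned diffs could be consumed; the dictionary keys match those built above, while print_diffs itself is only an illustrative helper.

def print_diffs(diffs):
    """Walk the diff structure returned by predict() and summarise it (illustrative only)."""
    for diff in diffs:
        pointers = (diff['group_pointer'], diff['column_pointer'],
                    diff['start_pointer'], diff['end_pointer'])
        for column, values in diff['ret_dict'].items():
            print('column {col} at {ptr}: {n} predicted values'.format(
                col=column, ptr=pointers, n=len(values)))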
Example #5
    def start(data, model_name):
        """
        Entry point for this worker: build a PredictWorker and run inference on the given data

        :param data: the vectorized data to predict on
        :param model_name: the model name, so we can pull stats and other metadata
        """

        w = PredictWorker(data, model_name)
        logging.info('Inferring from model and data...')
        return w.predict()
Example #6
    def run(self):

        # Handle transactions differently depending on the type of query
        # For now we only support LEARN and PREDICT

        # Train metadata is the metadata that was used when training the model.
        # Note that we need this train metadata even when predicting, so we can understand the model
        train_metadata = None

        if self.transaction.metadata.type == TRANSACTION_PREDICT:
            # extract this from the persistent_model_metadata
            train_metadata = TransactionMetadata()
            train_metadata.setFromDict(
                self.transaction.persistent_model_metadata.train_metadata)

        elif self.transaction.metadata.type == TRANSACTION_LEARN:
            # Pull this straight from the current transaction
            train_metadata = self.transaction.metadata

        else:
            # We cannot proceed without train metadata
            self.session.logging.error(
                'Do not support transaction {type}'.format(
                    type=self.transaction.metadata.type))
            self.transaction.error = True
            self.transaction.errorMsg = traceback.format_exc()
            return

        result = self.getPreparedInputDF(train_metadata)

        columns = list(result.columns.values)
        data_array = list(result.values.tolist())

        self.transaction.input_data.columns = columns

        # make sure that the column we are trying to predict is in the input_data,
        # else fail, because we cannot predict data we don't have
        # TODO: Revise this; source data without the predict column may still be ok if the prediction is not time series
        if len(data_array[0]) > 0 and self.transaction.metadata.model_predict_columns:
            for col_target in self.transaction.metadata.model_predict_columns:
                if col_target not in self.transaction.input_data.columns:
                    err = 'Trying to predict column {column} but column not in source data'.format(
                        column=col_target)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

        self.transaction.input_data.data_array = data_array

        # extract test data if this is a learn transaction and there is a test query
        if self.transaction.metadata.type == TRANSACTION_LEARN:

            # if a test data set was given, use it
            if self.transaction.metadata.test_from_data:
                df = self.transaction.metadata.test_from_data.df
                test_result = df.where((pandas.notnull(df)), None)

                columns = list(test_result.columns.values)
                data_array = test_result.values.tolist()

                # Make sure that test and train sets match column-wise
                if columns != self.transaction.input_data.columns:
                    err = 'Trying to get data for test but columns in train set and test set do not match'
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return
                total_data_array = len(self.transaction.input_data.data_array)
                total_test_array = len(data_array)
                test_indexes = list(
                    range(total_data_array, total_data_array + total_test_array))

                self.transaction.input_data.test_indexes = test_indexes
                # make the input data relevant
                self.transaction.input_data.data_array += data_array

                # the test set was given explicitly, so do not also sample test rows at random
                test_prob = 0

            else:
                test_prob = CONFIG.TEST_TRAIN_RATIO

            validation_prob = CONFIG.TEST_TRAIN_RATIO / (1 - test_prob)

            group_by = self.transaction.metadata.model_group_by

            if group_by:
                try:
                    group_by_index = self.transaction.input_data.columns.index(
                        group_by)
                except ValueError:
                    group_by_index = None
                    err = 'Trying to group by {column} but column not in source data'.format(
                        column=group_by)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

                # get unique group by values
                #all_group_by_items_query = ''' select {group_by_column} as grp, count(1) as total from ( {query} ) sub group by {group_by_column}'''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)
                #self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))

                uniques = result.groupby([group_by]).size()
                all_group_by_values = uniques.index.tolist()
                uniques_counts = uniques.values.tolist()

                # map each group-by value to its row count

                all_group_by_counts = {
                    value: uniques_counts[i]
                    for i, value in enumerate(all_group_by_values)
                }

                max_group_by = max(list(all_group_by_counts.values()))

                self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

                # we will fill these depending on the test_prob and validation_prob
                test_group_by_values = []
                validation_group_by_values = []
                train_group_by_values = []

                # split the data into test, validation, train by group by data
                for group_by_value in all_group_by_values:

                    # depending on a random draw: if less than test_prob, the group goes to the test set
                    # remember that test_prob can be 0 or the config value, depending on whether the test set was passed as a query
                    if float(random.random()) < test_prob and len(
                            train_group_by_values) > 0:
                        test_group_by_values += [group_by_value]
                    # elif float(random.random()) < validation_prob:
                    #     validation_group_by_values += [group_by_value]
                    else:
                        train_group_by_values += [group_by_value]

            for i, row in enumerate(self.transaction.input_data.data_array):

                in_test = i in self.transaction.input_data.test_indexes
                if not in_test:
                    if group_by:

                        group_by_value = row[group_by_index]
                        if group_by_value in test_group_by_values:
                            self.transaction.input_data.test_indexes += [i]
                        elif group_by_value in train_group_by_values:
                            self.transaction.input_data.train_indexes += [i]
                        elif group_by_value in validation_group_by_values:
                            self.transaction.input_data.validation_indexes += [i]

                    else:
                        # remember that test_prob can be 0 or the config value, depending on whether the test set was passed as a query
                        if float(random.random()) <= test_prob or len(
                                self.transaction.input_data.test_indexes) == 0:
                            self.transaction.input_data.test_indexes += [i]
                        elif float(random.random()) <= validation_prob or len(
                                self.transaction.input_data.validation_indexes) == 0:
                            self.transaction.input_data.validation_indexes += [i]
                        else:
                            self.transaction.input_data.train_indexes += [i]

            if len(self.transaction.input_data.test_indexes) == 0:
                logging.debug('Size of test set is zero, last split')
                ratio = CONFIG.TEST_TRAIN_RATIO
                if group_by and len(
                        self.transaction.input_data.train_indexes) > 2000:
                    # to avoid overfitting it seems to be good practice to double the ratio, as time series data tends to be abundant
                    ratio = ratio * 2
                test_size = int(
                    len(self.transaction.input_data.train_indexes) * ratio)
                self.transaction.input_data.test_indexes = \
                    self.transaction.input_data.train_indexes[-test_size:]
                self.transaction.input_data.train_indexes = \
                    self.transaction.input_data.train_indexes[:-test_size]

            logging.info('- Test: {size} rows'.format(
                size=len(self.transaction.input_data.test_indexes)))
            logging.info('- Train: {size} rows'.format(
                size=len(self.transaction.input_data.train_indexes)))
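
The splitting logic is easier to see in isolation. Below is a minimal, hedged sketch of the same idea over a plain list of rows; the function name and arguments are illustrative, not the transaction API.

import random

def split_indexes(rows, test_prob, validation_prob, group_by_index=None):
    """Assign each row index to a train, validation or test bucket (illustrative sketch)."""
    train, validation, test = [], [], []

    if group_by_index is not None:
        # keep whole groups together so grouped (e.g. time series) data is not split up;
        # validation-by-group stays empty here, mirroring the commented-out branch above
        groups = {row[group_by_index] for row in rows}
        test_groups = {g for g in groups if random.random() < test_prob}
        for i, row in enumerate(rows):
            (test if row[group_by_index] in test_groups else train).append(i)
    else:
        for i, _ in enumerate(rows):
            r = random.random()
            if r <= test_prob:
                test.append(i)
            elif r <= test_prob + validation_prob:
                validation.append(i)
            else:
                train.append(i)

    return train, validation, test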
Example #7
    def train(self):
        """

        :return:
        """

        last_epoch = 0
        lowest_error = None
        highest_accuracy = 0
        local_files = None

        for i in range(len(self.data_model_object.learning_rates)):

            self.data_model_object.setLearningRateIndex(i)

            for train_ret in self.data_model_object.trainModel(
                    self.train_sampler):

                logging.debug(
                    'Training State epoch:{epoch}, batch:{batch}, loss:{loss}'.
                    format(epoch=train_ret.epoch,
                           batch=train_ret.batch,
                           loss=train_ret.loss))

                # save model every new epoch
                if last_epoch != train_ret.epoch:
                    last_epoch = train_ret.epoch
                    logging.debug(
                        'New epoch:{epoch}, testing and calculating error'.
                        format(epoch=last_epoch))
                    test_ret = self.data_model_object.testModel(
                        self.test_sampler)
                    logging.info(
                        'Test Error:{error}, Accuracy:{accuracy} | Best Accuracy so far: {best_accuracy}'
                        .format(error=test_ret.error,
                                accuracy=test_ret.accuracy,
                                best_accuracy=highest_accuracy))
                    is_it_lowest_error_epoch = False
                    # if lowest error save model
                    if lowest_error is None:
                        lowest_error = test_ret.error
                    if lowest_error > test_ret.error:
                        is_it_lowest_error_epoch = True
                        lowest_error = test_ret.error
                        highest_accuracy = test_ret.accuracy
                        logging.info(
                            '[SAVING MODEL] Lowest ERROR so far! - Test Error: {error}, Accuracy: {accuracy}'
                            .format(error=test_ret.error,
                                    accuracy=test_ret.accuracy))
                        logging.debug(
                            'Lowest ERROR so far! Saving: model {model_name}, {data_model} config:{config}'
                            .format(
                                model_name=self.model_name,
                                data_model=self.ml_model_name,
                                config=self.ml_model_info.config_serialized))

                        # save model local file
                        local_files = self.saveToDisk(local_files)
                        # throttle model saving into GridFS to 10 minutes
                        # self.saveToGridFs(local_files, throttle=True)

                        # save model predicted - real vectors
                        logging.debug(
                            'Saved: model {model_name}:{ml_model_name} state vars into db [OK]'
                            .format(model_name=self.model_name,
                                    ml_model_name=self.ml_model_name))

                    # check if we should continue training
                    if not self.shouldContinue():
                        return
                    # save/update model loss, error, confusion_matrix
                    self.registerModelData(train_ret, test_ret,
                                           is_it_lowest_error_epoch)

            logging.info(
                'Loading model from store for retrain on new learning rate {lr}'
                .format(lr=self.data_model_object.learning_rates[i]
                        [LEARNING_RATE_INDEX]))
            # after it's done with this learning rate, load the stored model with the lowest error and keep training

            ml_model_info = self.ml_model_info.find_one({
                'model_name': self.model_name,
                'ml_model_name': self.ml_model_name,
                'config_serialized': json.dumps(self.config)
            })

            if ml_model_info is None:
                # TODO: Make sure we have a model for this
                logging.info('No model found in storage')
                return

            fs_file_ids = ml_model_info.fs_file_ids

            self.data_model_object = self.ml_model_class.loadFromDisk(
                file_ids=fs_file_ids)
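
The save-on-lowest-error pattern driving this loop, reduced to a minimal standalone sketch; evaluate and save are placeholder callables, not MindsDB APIs.

def train_with_checkpoints(epochs, evaluate, save):
    """Keep only the checkpoint with the lowest test error (illustrative sketch)."""
    lowest_error = None
    for _ in range(epochs):
        # ... one epoch of training would happen here ...
        error = evaluate()
        if lowest_error is None or error < lowest_error:
            lowest_error = error
            save()  # checkpoint only when the test error improves
    return lowest_error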
Example #8
    def __init__(self,
                 data,
                 model_name,
                 ml_model_name='pytorch.models.column_based_fcnn',
                 config=None):
        """
        Set up training for the given model and data model configuration

        :param data: the vectorized model data
        :type data: ModelData
        :param model_name: the name of the model
        :param ml_model_name: the dotted name of the ML model implementation
        :param config: the hyperparameter config
        """

        self.data = data
        self.model_name = model_name
        self.ml_model_name = ml_model_name
        # avoid a mutable default argument; fall back to an empty config
        self.config = config if config is not None else {}
        self.config_serialized = json.dumps(self.config)
        self.config_hash = hashtext(self.config_serialized)

        # get basic variables defined

        self.persistent_model_metadata = PersistentModelMetadata().find_one(
            {'model_name': self.model_name})

        self.ml_model_info = PersistentMlModelInfo()
        self.ml_model_info.model_name = self.model_name
        self.ml_model_info.ml_model_name = self.ml_model_name
        self.ml_model_info.config_serialized = self.config_serialized
        self.ml_model_info.insert()

        self.framework, self.dummy, self.data_model_name = self.ml_model_name.split(
            '.')
        self.ml_model_module_path = 'mindsdb.libs.ml_models.' + self.ml_model_name + '.' + self.data_model_name
        self.ml_model_class_name = convert_snake_to_cammelcase_string(
            self.data_model_name)

        self.ml_model_module = importlib.import_module(
            self.ml_model_module_path)
        self.ml_model_class = getattr(self.ml_model_module,
                                      self.ml_model_class_name)

        self.train_sampler = Sampler(
            self.data.train_set,
            metadata_as_stored=self.persistent_model_metadata,
            ignore_types=self.ml_model_class.ignore_types,
            sampler_mode=SAMPLER_MODES.LEARN)
        self.test_sampler = Sampler(
            self.data.test_set,
            metadata_as_stored=self.persistent_model_metadata,
            ignore_types=self.ml_model_class.ignore_types,
            sampler_mode=SAMPLER_MODES.LEARN)

        self.train_sampler.variable_wrapper = self.ml_model_class.variable_wrapper
        self.test_sampler.variable_wrapper = self.ml_model_class.variable_wrapper
        self.sample_batch = self.train_sampler.getSampleBatch()

        # the last time the model was saved into GridFS; assume it was just now
        self.gfs_save_head_time = time.time()

        logging.info('Starting model...')
        self.data_model_object = self.ml_model_class(self.sample_batch)
        logging.info('Training model...')
        self.train()
Example #9
    def run(self):

        # Handle transactions differently depending on the type of query
        # For now we only support LEARN and PREDICT

        train_metadata = self.transaction.metadata

        if self.transaction.metadata.type == TRANSACTION_PREDICT:

            self.populatePredictQuery()

            train_metadata = TransactionMetadata()
            train_metadata.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

        elif self.transaction.metadata.type not in [TRANSACTION_PREDICT, TRANSACTION_LEARN]:

            self.session.logging.error('Do not support transaction {type}'.format(type=self.transaction.metadata.type))
            self.transaction.error = True
            self.transaction.errorMsg = traceback.format_exc()
            return

        query = self.prepareFullQuery(train_metadata)

        try:
            self.transaction.session.logging.info('About to pull query {query}'.format(query=query))
            conn = sqlite3.connect(self.transaction.metadata.storage_file)
            self.logging.info(self.transaction.metadata.model_query)
            df = pandas.read_sql_query(query, conn)
            result = df.where((pandas.notnull(df)), None)
            df = None # clean memory

        except Exception:

            self.session.logging.error(traceback.format_exc())
            self.transaction.error = True
            self.transaction.errorMsg = traceback.format_exc()
            return

        columns = list(result.columns.values)
        data_array = list(result.values.tolist())

        self.transaction.input_data.columns = columns

        if len(data_array[0]) > 0 and self.transaction.metadata.model_predict_columns:
            for col_target in self.transaction.metadata.model_predict_columns:
                if col_target not in self.transaction.input_data.columns:
                    err = 'Trying to predict column {column} but column not in source data'.format(column=col_target)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

        self.transaction.input_data.data_array = data_array

        # extract test data if this is a learn transaction and there is a test query
        if self.transaction.metadata.type == TRANSACTION_LEARN:

            if self.transaction.metadata.model_test_query:
                try:
                    test_query = query_wrapper.format(orig_query=self.transaction.metadata.model_test_query, order_by_string=order_by_string, where_not_null_string=where_not_null_string)
                    self.transaction.session.logging.info('About to pull TEST query {query}'.format(query=test_query))
                    #drill = self.session.drill.query(test_query, timeout=CONFIG.DRILL_TIMEOUT)
                    df = pandas.read_sql_query(test_query, conn)
                    result = df.where((pandas.notnull(df)), None)
                    df = None

                    #result = vars(drill)['data']
                except Exception:

                    # If testing offline, get results from a .cache file
                    self.session.logging.error(traceback.format_exc())
                    self.transaction.error = True
                    self.transaction.errorMsg = traceback.format_exc()
                    return

                columns = list(result.columns.values)
                data_array = result.values.tolist()

                # Make sure that test and train sets match column-wise
                if columns != self.transaction.input_data.columns:
                    err = 'Trying to get data for test but columns in train set and test set do not match'
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return
                total_data_array = len(self.transaction.input_data.data_array)
                total_test_array = len(data_array)
                test_indexes = [i for i in range(total_data_array, total_data_array+total_test_array)]

                self.transaction.input_data.test_indexes = test_indexes
                # make the input data relevant
                self.transaction.input_data.data_array += data_array

                # we later use this to either regenerate or not
                test_prob = 0

            else:
                test_prob = CONFIG.TEST_TRAIN_RATIO

            validation_prob = CONFIG.TEST_TRAIN_RATIO / (1-test_prob)

            group_by = self.transaction.metadata.model_group_by

            if group_by:
                try:
                    group_by_index = self.transaction.input_data.columns.index(group_by)
                except ValueError:
                    group_by_index = None
                    err = 'Trying to group by {column} but column not in source data'.format(column=group_by)
                    self.session.logging.error(err)
                    self.transaction.error = True
                    self.transaction.errorMsg = err
                    return

                # get unique group by values
                all_group_by_items_query = ''' select {group_by_column} as grp, count(1) as total from ( {query} ) sub group by {group_by_column}'''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)
                self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))
                df = pandas.read_sql_query(all_group_by_items_query, conn)
                result = df.where((pandas.notnull(df)), None)
                # map each group-by value to its count; the query result rows are [value, count] pairs

                all_group_by_counts = {i[0]:i[1] for i in result.values.tolist()}
                all_group_by_values = all_group_by_counts.keys()

                max_group_by = max(list(all_group_by_counts.values()))

                self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

                # we will fill these depending on the test_prob and validation_prob
                test_group_by_values = []
                validation_group_by_values = []
                train_group_by_values = []

                # split the data into test, validation, train by group by data
                for group_by_value in all_group_by_values:

                    # depending on a random draw: if less than test_prob, the group goes to the test set
                    # remember that test_prob can be 0 or the config value, depending on whether the test set was passed as a query
                    if float(random.random()) < test_prob and len(train_group_by_values) > 0:
                        test_group_by_values += [group_by_value]
                    # elif float(random.random()) < validation_prob:
                    #     validation_group_by_values += [group_by_value]
                    else:
                        train_group_by_values += [group_by_value]

            for i, row in enumerate(self.transaction.input_data.data_array):

                in_test = i in self.transaction.input_data.test_indexes
                if not in_test:
                    if group_by:

                        group_by_value = row[group_by_index]
                        if group_by_value in test_group_by_values:
                            self.transaction.input_data.test_indexes += [i]
                        elif group_by_value in train_group_by_values:
                            self.transaction.input_data.train_indexes += [i]
                        elif group_by_value in validation_group_by_values:
                            self.transaction.input_data.validation_indexes += [i]

                    else:
                        # remember that test_prob can be 0 or the config value, depending on whether the test set was passed as a query
                        if float(random.random()) <= test_prob or len(self.transaction.input_data.test_indexes) == 0:
                            self.transaction.input_data.test_indexes += [i]
                        elif float(random.random()) <= validation_prob or len(self.transaction.input_data.validation_indexes) == 0:
                            self.transaction.input_data.validation_indexes += [i]
                        else:
                            self.transaction.input_data.train_indexes += [i]

            if len(self.transaction.input_data.test_indexes) == 0:
                logging.debug('Size of test set is zero, last split')
                ratio = CONFIG.TEST_TRAIN_RATIO
                if group_by and len(self.transaction.input_data.train_indexes) > 2000:
                    # to avoid overfitting it seems to be good practice to double the ratio, as time series data tends to be abundant
                    ratio = ratio * 2
                test_size = int(len(self.transaction.input_data.train_indexes) * ratio)
                self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[-test_size:]
                self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:-test_size]

            logging.info('- Test: {size} rows'.format(size=len(self.transaction.input_data.test_indexes)))
            logging.info('- Train: {size} rows'.format(size=len(self.transaction.input_data.train_indexes)))
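
The GROUP BY counting query can be exercised on its own. A small, hedged sketch of the same idea against an in-memory SQLite table; the table and column names are made up for illustration.

import sqlite3
import pandas

# build a throwaway table so the GROUP BY count query has something to run against
conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE readings (sensor TEXT, value REAL)')
conn.executemany('INSERT INTO readings VALUES (?, ?)',
                 [('a', 1.0), ('a', 2.0), ('b', 3.0)])

# same shape as the query above: one row per group with its total count
query = 'select sensor as grp, count(1) as total from readings group by sensor'
df = pandas.read_sql_query(query, conn)

# map each group value to its count, as the transaction code does
all_group_by_counts = {row[0]: row[1] for row in df.values.tolist()}
print(all_group_by_counts)  # {'a': 2, 'b': 1}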