Example #1
0
    def predict(self, when=None, when_data=None, update_cached_model=False, use_gpu=True):
        """
        You have a mind trained already and you want to make a prediction

        :param when: (optional) a dict of conditions for a single prediction, or a list of such dicts; None is treated as an empty dict
        :param when_data: (optional) use this when you have data in either a file, a pandas data frame, or url to a file that you want to predict from
        :param update_cached_model: (optional, default:False) when you run predict for the first time, it loads the latest model in memory, you can force it to do this on this run by flipping it to True (NOTE: this argument is not read inside this method body)
        :param use_gpu: (optional, default:True) stored in the light transaction metadata under 'use_gpu'

        :return: TransactionOutputData object (the ``output_data`` attribute of the created Transaction)
        """

        # The default is created per call: a `{}` default in the signature
        # would be one shared object handed to every Transaction.
        if when is None:
            when = {}

        when_ds = None if when_data is None else getDS(when_data)

        # lets turn into lists: a single set of conditions becomes a
        # one-element list, anything else is passed through untouched
        if isinstance(when, dict):
            when = [when]

        heavy_transaction_metadata = {
            'when_data': when_ds,
            'model_when_conditions': when,
            'name': self.name,
        }

        light_transaction_metadata = {
            'name': self.name,
            'type': TRANSACTION_PREDICT,
            'use_gpu': use_gpu,
            'data_preparation': {},
        }

        transaction = Transaction(session=self, light_transaction_metadata=light_transaction_metadata, heavy_transaction_metadata=heavy_transaction_metadata)

        return transaction.output_data
Example #2
0
    def analyse_dataset(self, from_data, sample_margin_of_error=CONFIG.DEFAULT_MARGIN_OF_ERROR):
        """
        Run an analysis transaction over the given dataset

        :param from_data: the data to analyse, it is converted with getDS before use
        :param sample_margin_of_error: margin of error stored in the metadata; the confidence level is stored as 1 minus this value

        :return: nothing, the Transaction is only constructed
        """
        data_source = getDS(from_data)
        confidence_level = 1 - sample_margin_of_error

        hmd = {
            'name': self.name,
            'from_data': data_source,
        }

        # Key order mirrors the order in which the metadata has always been filled in
        lmd = {
            'version': str(__version__),
            'name': self.name,
            'model_columns_map': data_source._col_map,
            'type': TRANSACTION_ANALYSE,
            'sample_margin_of_error': sample_margin_of_error,
            'sample_confidence_level': confidence_level,
            'model_is_time_series': False,
            'model_group_by': [],
            'model_order_by': [],
            'malformed_columns': [],
            'data_preparation': {},
        }

        Transaction(
            session=self,
            light_transaction_metadata=lmd,
            heavy_transaction_metadata=hmd,
            logger=self.log,
        )
Example #3
0
def transaction(logger):
    """
    Build a Transaction with minimal metadata

    The light metadata only carries a 'type' of 'foobar', the heavy metadata
    is empty, and the session comes from the enclosing scope.

    :param logger: logger handed to the Transaction
    :return: the constructed Transaction
    """
    light_metadata = {'type': 'foobar'}
    heavy_metadata = {}
    return Transaction(
        session=session,
        light_transaction_metadata=light_metadata,
        heavy_transaction_metadata=heavy_metadata,
        logger=logger,
    )
Example #4
0
    def analyse_dataset(self, from_data, sample_margin_of_error=0.005):
        """
        Run an analysis transaction over the given dataset and return the model data

        :param from_data: the data to analyse, it is converted with getDS before use
        :param sample_margin_of_error: margin of error stored in the metadata; the confidence level is stored as 1 minus this value

        :return: whatever self.get_model_data returns for the light metadata used by the transaction
        """
        data_source = getDS(from_data)
        confidence_level = 1 - sample_margin_of_error

        hmd = {
            'name': self.name,
            'from_data': data_source,
        }

        # Key order mirrors the order in which the metadata has always been filled in
        lmd = {
            'version': str(__version__),
            'name': self.name,
            'model_columns_map': data_source._col_map,
            'type': TRANSACTION_ANALYSE,
            'sample_margin_of_error': sample_margin_of_error,
            'sample_confidence_level': confidence_level,
            'model_is_time_series': False,
            'model_group_by': [],
            'model_order_by': [],
            'columns_to_ignore': [],
            'data_preparation': {},
            'predict_columns': [],
            'empty_columns': [],
            'handle_foreign_keys': True,
            'force_categorical_encoding': [],
            'handle_text_as_categorical': False,
            'data_types': {},
            'data_subtypes': {},
        }

        Transaction(
            session=self,
            light_transaction_metadata=lmd,
            heavy_transaction_metadata=hmd,
            logger=self.log,
        )

        # The very same dict object that was handed to the Transaction is passed on
        return self.get_model_data(model_name=None, lmd=lmd)
Example #5
0
    def predict(self,
                when=None,
                when_data=None,
                update_cached_model=False,
                use_gpu=None,
                unstable_parameters_dict=None,
                backend=None,
                run_confidence_variation_analysis=False):
        """
        Make a prediction with an already trained mind

        :param when: conditions for a single prediction (a dict), None is treated as an empty dict
        :param when_data: data to predict from, converted with getDS when given
        :param update_cached_model: (optional, default:False) accepted but not read inside this method body
        :param use_gpu: stored in the light transaction metadata under 'use_gpu'
        :param unstable_parameters_dict: optional dict, only its 'force_disable_cache' entry is read (defaults to False)
        :param backend: when not None, stored in the heavy transaction metadata under 'model_backend'
        :param run_confidence_variation_analysis: Run a confidence variation analysis on each of the given input column, only allowed when `when_data` is not given

        :return: the ``output_data`` attribute of the created Transaction
        :raises ValueError: if run_confidence_variation_analysis is True while when_data is given
        """

        if when is None:
            when = {}

        if unstable_parameters_dict is None:
            unstable_parameters_dict = {}

        if when_data is not None and run_confidence_variation_analysis is True:
            error_msg = 'run_confidence_variation_analysis=True is a valid option only when predicting a single data point via `when`'
            self.log.error(error_msg)
            raise ValueError(error_msg)

        when_ds = getDS(when_data) if when_data is not None else None

        # A single set of conditions is wrapped into a list; `when` can no
        # longer be None at this point, so only the dict case remains
        if type(when) is dict:
            when = [when]

        hmd = {
            'when_data': when_ds,
            'model_when_conditions': when,
            'name': self.name,
        }
        if backend is not None:
            hmd['model_backend'] = backend

        lmd = {
            'name': self.name,
            'type': TRANSACTION_PREDICT,
            'use_gpu': use_gpu,
            'data_preparation': {},
            'run_confidence_variation_analysis': run_confidence_variation_analysis,
            'force_disable_cache': unstable_parameters_dict.get('force_disable_cache', False),
        }

        prediction = Transaction(
            session=self,
            light_transaction_metadata=lmd,
            heavy_transaction_metadata=hmd)

        return prediction.output_data
Example #6
0
    def learn(self,
              to_predict,
              from_data,
              test_from_data=None,
              group_by=None,
              window_size=None,
              order_by=None,
              sample_margin_of_error=0.005,
              ignore_columns=None,
              stop_training_in_x_seconds=None,
              stop_training_in_accuracy=None,
              backend='lightwood',
              rebuild_model=True,
              use_gpu=None,
              disable_optional_analysis=False,
              equal_accuracy_for_all_output_categories=True,
              output_categories_importance_dictionary=None,
              unstable_parameters_dict=None):
        """
        Learn to predict a column or columns from the data in 'from_data'

        Mandatory arguments:
        :param to_predict: the column or list of columns to predict
        :param from_data: the data to learn from, it is converted with getDS before use

        Optional arguments:
        :param test_from_data: a separate data set to test with, converted with getDS when given
        :param backend: stored in the heavy metadata under 'model_backend'
        :param rebuild_model: when False, previously pickled metadata for this name is loaded from CONFIG.MINDSDB_STORAGE_PATH and only a fixed set of keys is overridden with the values of this call
        :param use_gpu: stored in the light metadata under 'use_gpu'
        :param disable_optional_analysis: stored in the light metadata under 'disable_optional_analysis'
        :param equal_accuracy_for_all_output_categories: stored in the light metadata under the same name
        :param output_categories_importance_dictionary: stored in the light metadata under the same name, None becomes an empty dict
        :param unstable_parameters_dict: optional overrides for 'skip_model_training', 'skip_stats_generation', 'optimize_model', 'force_disable_cache', 'force_categorical_encoding', 'handle_foreign_keys', 'handle_text_as_categorical' and 'use_selfaware_model'

        Optional Time series arguments:
        :param order_by: column or list of columns defining the time series; an entry that is not a tuple becomes ('column_name', True)
        :param group_by: column or list of columns to group rows by
        :param window_size: stored in the light metadata under 'window_size'

        Optional data transformation arguments:
        :param ignore_columns: stored in the light metadata under 'columns_to_ignore'

        Optional sampling parameters:
        :param sample_margin_of_error (DEFAULT 0.005): stored in the light metadata, the confidence level is stored as 1 minus this value

        Optional debug arguments:
        :param stop_training_in_x_seconds: (default None) stored in the light metadata under the same name
        :param stop_training_in_accuracy: accepted but not read inside this method body

        :return: nothing, the Transaction is only constructed
        :raises ValueError: if to_predict is an empty list
        """

        if order_by is None:
            order_by = []
        if ignore_columns is None:
            ignore_columns = []
        if unstable_parameters_dict is None:
            unstable_parameters_dict = {}

        from_ds = getDS(from_data)
        test_from_ds = None if test_from_data is None else getDS(test_from_data)

        sample_confidence_level = 1 - sample_margin_of_error

        # lets turn into lists: predict, order_by and group by
        predict_columns = to_predict if type(to_predict) is list else [to_predict]
        if type(group_by) is not list:
            group_by = [group_by] if group_by else []
        if type(order_by) is not list:
            order_by = [order_by] if order_by else []

        if not predict_columns:
            error = 'You need to specify a column to predict'
            self.log.error(error)
            raise ValueError(error)

        # every order_by entry ends up as ('column_name', ascending), ascending defaults to True
        order_by = [
            entry if type(entry) is tuple else (entry, True)
            for entry in order_by
        ]

        is_time_series = len(order_by) > 0

        # We don't implement "name" as a concept in mindsdb data sources, this is only available for files,
        # the server doesn't handle non-file data sources at the moment, so this shouldn't prove an issue,
        # once we want to support datasources such as s3 and databases for the server we need to add name
        # as a concept (or, preferably, before that)
        if type(from_data) == str:
            data_source_name = from_data
        else:
            data_source_name = 'Unkown'

        hmd = {
            'name': self.name,
            'from_data': from_ds,
            'test_from_data': test_from_ds,
            'bucketing_algorithms': {},
            'predictions': None,
            'model_backend': backend,
        }

        if output_categories_importance_dictionary is None:
            categories_importance = {}
        else:
            categories_importance = output_categories_importance_dictionary

        # Key order mirrors the order in which the metadata has always been filled in
        lmd = {
            'version': str(__version__),
            'name': self.name,
            'data_preparation': {},
            'predict_columns': predict_columns,
            'model_columns_map': from_ds._col_map,
            'model_group_by': group_by,
            'model_order_by': order_by,
            'model_is_time_series': is_time_series,
            'data_source': data_source_name,
            'type': TRANSACTION_LEARN,
            'window_size': window_size,
            'sample_margin_of_error': sample_margin_of_error,
            'sample_confidence_level': sample_confidence_level,
            'stop_training_in_x_seconds': stop_training_in_x_seconds,
            'rebuild_model': rebuild_model,
            'model_accuracy': {
                'train': {},
                'test': {}
            },
            'column_importances': None,
            'columns_buckets_importances': None,
            'columnless_prediction_distribution': None,
            'all_columns_prediction_distribution': None,
            'use_gpu': use_gpu,
            'columns_to_ignore': ignore_columns,
            'disable_optional_analysis': disable_optional_analysis,
            'validation_set_accuracy': None,
            'lightwood_data': {},
            'ludwig_data': {},
            'weight_map': {},
            'confusion_matrices': {},
            'empty_columns': [],
            'equal_accuracy_for_all_output_categories': equal_accuracy_for_all_output_categories,
            'output_categories_importance_dictionary': categories_importance,
        }

        # Unstable options and the value used when the caller did not supply them
        unstable_defaults = (
            ('skip_model_training', False),
            ('skip_stats_generation', False),
            ('optimize_model', False),
            ('force_disable_cache', False),
            ('force_categorical_encoding', []),
            ('handle_foreign_keys', False),
            ('handle_text_as_categorical', False),
            ('use_selfaware_model', True),
        )
        for option, fallback in unstable_defaults:
            lmd[option] = unstable_parameters_dict.get(option, fallback)

        if rebuild_model is False:
            # Remember what this call asked for before the dicts are replaced
            requested_lmd = dict(lmd)
            requested_hmd = dict(hmd)

            # NOTE: pickle.load can execute arbitrary code, the storage path must be trusted
            lmd_path = os.path.join(
                CONFIG.MINDSDB_STORAGE_PATH,
                requested_lmd['name'] + '_light_model_metadata.pickle')
            with open(lmd_path, 'rb') as fp:
                lmd = pickle.load(fp)

            hmd_path = os.path.join(
                CONFIG.MINDSDB_STORAGE_PATH,
                requested_hmd['name'] + '_heavy_model_metadata.pickle')
            with open(hmd_path, 'rb') as fp:
                hmd = pickle.load(fp)

            # Only these keys of the stored metadata follow the current call,
            # and only when the current call has a non-None value for them
            overridable_lmd_keys = (
                'data_preparation', 'rebuild_model', 'data_source', 'type',
                'columns_to_ignore', 'sample_margin_of_error',
                'sample_confidence_level', 'stop_training_in_x_seconds',
            )
            for key in overridable_lmd_keys:
                if requested_lmd[key] is not None:
                    lmd[key] = requested_lmd[key]

            for key in ('from_data', 'test_from_data'):
                if requested_hmd[key] is not None:
                    hmd[key] = requested_hmd[key]

        Transaction(session=self,
                    light_transaction_metadata=lmd,
                    heavy_transaction_metadata=hmd,
                    logger=self.log)
Example #7
0
    def learn(self,
              to_predict,
              from_data=None,
              test_from_data=None,
              group_by=None,
              window_size_samples=None,
              window_size_seconds=None,
              window_size=None,
              order_by=None,
              sample_margin_of_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
              ignore_columns=None,
              rename_strange_columns=False,
              stop_training_in_x_seconds=None,
              stop_training_in_accuracy=None,
              send_logs=CONFIG.SEND_LOGS,
              backend='ludwig',
              rebuild_model=True,
              use_gpu=True,
              disable_optional_analysis=False):
        """
        Tells the mind to learn to predict a column or columns from the data in 'from_data'

        Mandatory arguments:
        :param to_predict: what column or columns you want to predict
        :param from_data: the data that you want to learn from, this can be either a file, a pandas data frame, or url to a file

        Optional arguments:
        :param test_from_data: If you would like to test this learning from a different data set
        :param backend: stored in the light metadata under 'model_backend'
        :param rebuild_model: when False, previously pickled metadata for this name is loaded from CONFIG.MINDSDB_STORAGE_PATH and only a fixed set of keys is overridden with the values of this call
        :param use_gpu: stored in the light metadata under 'use_gpu'
        :param disable_optional_analysis: stored in the light metadata under 'disable_optional_analysis'

        Optional Time series arguments:
        :param order_by: this order by defines the time series, it can be a list. By default it sorts each sort by column in ascending manner, if you want to change this pass a tuple ('column_name', 'boolean_for_ascending <default=true>'). None (the default) means no ordering
        :param group_by: This argument tells the time series that it should learn by grouping rows by a given id
        :param window_size_samples: stored in the light metadata under 'window_size_samples'
        :param window_size_seconds: stored in the light metadata under 'window_size_seconds'
        :param window_size: backwards compatible alias, when given it overrides window_size_samples

        Optional data transformation arguments:
        :param ignore_columns: stored in the light metadata under 'ignore_columns'. None (the default) becomes a fresh empty list
        :param rename_strange_columns: this tells mindsDB that if columns have special characters, it should try to rename them, this is a legacy argument, as now mindsdb supports any column name

        Optional sampling parameters:
        :param sample_margin_of_error (DEFAULT CONFIG.DEFAULT_MARGIN_OF_ERROR): Maximum expected difference between the true population parameter, such as the mean, and the sample estimate.

        Optional debug arguments:
        :param send_logs: If you want to stream these logs to a server (NOTE: this argument is not read inside this method body)
        :param stop_training_in_x_seconds: (default None), if set, you want training to finish in a given number of seconds
        :param stop_training_in_accuracy: (default None) stored in the light metadata under the same name

        :return: nothing, the Transaction is only constructed
        :raises ValueError: if to_predict is an empty list
        """

        # Defaults are created per call: a `[]` default in the signature is a
        # single shared list, and ignore_columns is stored in the metadata
        # handed to the Transaction, so any mutation there would leak into
        # every later call.
        if order_by is None:
            order_by = []
        if ignore_columns is None:
            ignore_columns = []

        # Backwards compatibility of interface
        if window_size is not None:
            window_size_samples = window_size

        from_ds = getDS(from_data)
        test_from_ds = None if test_from_data is None else getDS(test_from_data)

        sample_confidence_level = 1 - sample_margin_of_error
        predict_columns_map = {}

        # lets turn into lists: predict, order_by and group by
        predict_columns = to_predict if isinstance(to_predict, list) else [to_predict]
        if not isinstance(group_by, list):
            group_by = [group_by] if group_by else []
        if not isinstance(order_by, list):
            order_by = [order_by] if order_by else []

        if len(predict_columns) == 0:
            error = 'You need to specify a column to predict'
            self.log.error(error)
            raise ValueError(error)

        # lets turn order by into tuples if not already
        # each element ('column_name', 'boolean_for_ascending <default=true>')
        order_by = [
            col_name if isinstance(col_name, tuple) else (col_name, True)
            for col_name in order_by
        ]

        is_time_series = len(order_by) > 0

        if rename_strange_columns is False:
            # Refer to the predicted columns by the names the data source uses
            for predict_col in predict_columns:
                predict_col_as_in_df = from_ds.getColNameAsInDF(predict_col)
                predict_columns_map[predict_col_as_in_df] = predict_col

            predict_columns = list(predict_columns_map.keys())
        else:
            self.log.warning(
                'Note that after version 1.0, the default value for argument rename_strange_columns in MindsDB().learn, will be flipped from True to False, this means that if your data has columns with special characters, MindsDB will not try to rename them by default.'
            )

        heavy_transaction_metadata = {
            'name': self.name,
            'from_data': from_ds,
            'test_from_data': test_from_ds,
            'bucketing_algorithms': {},
            'predictions': None,
        }

        # Key order mirrors the order in which the metadata has always been filled in
        light_transaction_metadata = {
            'version': str(__version__),
            'name': self.name,
            'data_preparation': {},
            'model_backend': backend,
            'predict_columns': predict_columns,
            'model_columns_map': {} if rename_strange_columns else from_ds._col_map,
            'model_group_by': group_by,
            'model_order_by': order_by,
            'window_size_samples': window_size_samples,
            'window_size_seconds': window_size_seconds,
            'model_is_time_series': is_time_series,
            'data_source': from_data,
            'type': TRANSACTION_LEARN,
            'ignore_columns': ignore_columns,
            'sample_margin_of_error': sample_margin_of_error,
            'sample_confidence_level': sample_confidence_level,
            'stop_training_in_x_seconds': stop_training_in_x_seconds,
            'stop_training_in_accuracy': stop_training_in_accuracy,
            'rebuild_model': rebuild_model,
            'model_accuracy': {
                'train': {},
                'test': {}
            },
            'column_importances': None,
            'columns_buckets_importances': None,
            'columnless_prediction_distribution': None,
            'all_columns_prediction_distribution': None,
            'use_gpu': use_gpu,
            'malformed_columns': {
                'names': [],
                'indices': []
            },
            'disable_optional_analysis': disable_optional_analysis,
            'validation_set_accuracy': None,
        }

        if rebuild_model is False:
            # Remember what this call asked for before the dicts are replaced
            old_lmd = dict(light_transaction_metadata)
            old_hmd = dict(heavy_transaction_metadata)

            # NOTE: pickle.load can execute arbitrary code, the storage path must be trusted
            with open(
                    os.path.join(
                        CONFIG.MINDSDB_STORAGE_PATH,
                        old_lmd['name'] + '_light_model_metadata.pickle'),
                    'rb') as fp:
                light_transaction_metadata = pickle.load(fp)

            with open(
                    os.path.join(
                        CONFIG.MINDSDB_STORAGE_PATH,
                        old_hmd['name'] + '_heavy_model_metadata.pickle'),
                    'rb') as fp:
                heavy_transaction_metadata = pickle.load(fp)

            # Only these keys of the stored metadata follow the current call,
            # and only when the current call has a non-None value for them
            for k in [
                    'data_preparation', 'rebuild_model', 'data_source', 'type',
                    'ignore_columns', 'sample_margin_of_error',
                    'sample_confidence_level', 'stop_training_in_x_seconds',
                    'stop_training_in_accuracy'
            ]:
                if old_lmd[k] is not None:
                    light_transaction_metadata[k] = old_lmd[k]

            for k in ['from_data', 'test_from_data']:
                if old_hmd[k] is not None:
                    heavy_transaction_metadata[k] = old_hmd[k]

        Transaction(session=self,
                    light_transaction_metadata=light_transaction_metadata,
                    heavy_transaction_metadata=heavy_transaction_metadata,
                    logger=self.log)