Example #1
def export_predictor(model_name):
    """Exports a Predictor to a zip file in the CONFIG.MINDSDB_STORAGE_PATH directory.

    :param model: a Predictor
    :param model_name: this is the name of the model you wish to export (defaults to the name of the passed Predictor)
    """
    with MDBLock('shared', 'predict_' + model_name):
        storage_file = model_name + '.zip'
        with zipfile.ZipFile(storage_file, 'w') as zip_fp:
            for file_name in [
                    'heavy_model_metadata.pickle',
                    'light_model_metadata.pickle', 'lightwood_data'
            ]:
                full_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                         model_name, file_name)
                zip_fp.write(full_path, os.path.basename(full_path))

            # If the backend is ludwig, save the ludwig files
            try:
                ludwig_model_path = os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                                 model_name, 'ludwig_data')
                for root, dirs, files in os.walk(ludwig_model_path):
                    for file in files:
                        full_path = os.path.join(root, file)
                        zip_fp.write(
                            full_path,
                            full_path[len(CONFIG.MINDSDB_STORAGE_PATH):])
            except Exception:
                pass

        print(f'Exported model to {storage_file}')
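
A quick usage sketch. Assuming the function is exposed through mindsdb_native's functional interface (`F`) — an assumption, not something shown above:

from mindsdb_native import F

# Hypothetical call: writes 'home_rentals.zip' to the current working directory
F.export_predictor('home_rentals')
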
Example #2
    def predict(self,
                when_data,
                use_gpu=None,
                advanced_args=None,
                backend=None,
                run_confidence_variation_analysis=False):
        """
        Make a prediction using an already-trained predictor.

        :param when_data: the data you want to predict from: a python dict, a list of dicts, a pandas data frame, a file path, or a url to a file
        :param use_gpu: whether to run the prediction on the GPU; None leaves the choice to the backend
        :param advanced_args: dictionary of advanced arguments, e.g. `{'force_disable_cache': True}`
        :param backend: override the model backend used for this prediction
        :param run_confidence_variation_analysis: run a confidence variation analysis on each of the given input columns; only valid when predicting a single data point

        :return: TransactionOutputData object
        """
        with MDBLock('shared', 'learn_' + self.name):
            if advanced_args is None:
                advanced_args = {}
            if run_confidence_variation_analysis is True and isinstance(when_data, list) and len(when_data) > 1:
                error_msg = 'run_confidence_variation_analysis=True is a valid option only when predicting a single data point'
                self.log.error(error_msg)
                raise ValueError(error_msg)

            transaction_type = TRANSACTION_PREDICT
            when_ds = None
            when = None

            if isinstance(when_data, dict):
                when = [when_data]
            elif isinstance(when_data, list):
                when = when_data
            else:
                when_ds = None if when_data is None else getDS(when_data)

            disable_lightwood_transform_cache = False
            heavy_transaction_metadata = {}
            if when_ds is None:
                heavy_transaction_metadata['when_data'] = None
            else:
                heavy_transaction_metadata['when_data'] = when_ds
                _, _, disable_lightwood_transform_cache = _get_memory_optimizations(when_ds.df)
            heavy_transaction_metadata['when'] = when
            heavy_transaction_metadata['name'] = self.name

            if backend is not None:
                heavy_transaction_metadata['model_backend'] = backend

            light_transaction_metadata = dict(
                name = self.name,
                type = transaction_type,
                use_gpu = use_gpu,
                data_preparation = {},
                run_confidence_variation_analysis = run_confidence_variation_analysis,
                force_disable_cache = advanced_args.get('force_disable_cache', disable_lightwood_transform_cache),
                breakpoint=self.breakpoint
            )

            self.transaction = PredictTransaction(session=self,
                                    light_transaction_metadata=light_transaction_metadata,
                                    heavy_transaction_metadata=heavy_transaction_metadata)
            return self.transaction.output_data
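
A usage sketch for the call above; the model name and input columns are placeholders, and the Predictor import path is an assumption:

from mindsdb_native import Predictor

predictor = Predictor(name='home_rentals')

# `when_data` may be a dict (single row), a list of dicts, a data frame, a file path, or a url
result = predictor.predict(
    when_data={'number_of_rooms': 2, 'sqft': 700},
    run_confidence_variation_analysis=True  # valid only for a single data point
)
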
Example #3
    def test(self, when_data, accuracy_score_functions, score_using='predicted_value', predict_args=None):
        """
        :param when_data: the data you want to predict from: a file path, a pandas data frame, or a url to a file
        :param accuracy_score_functions: a single function, or a dictionary of the form `{f'{target_name}': acc_func}` when there are multiple targets
        :param score_using: which value from the target's `explanation` to pass to the score function; defaults to `predicted_value`
        :param predict_args: dictionary of arguments to be passed to `predict`, e.g. `predict_args={'use_gpu': True}`

        :return: a dictionary of the form `{f'{target_name}_accuracy': accuracy_func_return}`, e.g. `{'rental_price_accuracy': 0.99}`
        """
        with MDBLock('shared', 'learn_' + self.name):
            if predict_args is None:
                predict_args = {}

            predictions = self.predict(when_data=when_data, **predict_args)

            with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, f'{self.name}_light_model_metadata.pickle'), 'rb') as fp:
                lmd = pickle.load(fp)

            accuracy_dict = {}
            for col in lmd['predict_columns']:
                if isinstance(accuracy_score_functions, dict):
                    acc_f = accuracy_score_functions[col]
                else:
                    acc_f = accuracy_score_functions

                if score_using is None:
                    predicted = [x.explanation[col] for x in predictions]
                else:
                    predicted = [x.explanation[col][score_using] for x in predictions]

                real = [x[f'__observed_{col}'] for x in predictions]
                accuracy_dict[f'{col}_accuracy'] = acc_f(real, predicted)

            return accuracy_dict
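
A usage sketch; the metric, column name, and file path are placeholders, and the import paths are assumptions:

from sklearn.metrics import r2_score
from mindsdb_native import Predictor

predictor = Predictor(name='home_rentals')
accuracy = predictor.test(
    when_data='test.csv',
    accuracy_score_functions=r2_score,  # or {'rental_price': r2_score} for multiple targets
    predict_args={'use_gpu': False}
)
print(accuracy)  # e.g. {'rental_price_accuracy': 0.97}
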
Example #4
def test_exclusive_lock():
    lock = MDBLock('exclusive', 'name')

    with pytest.raises(ExceptionForTest):
        lock(func)()

    with pytest.raises(ExceptionForTest):
        with lock:
            func()
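
The test exercises the lock both as a decorator-style callable and as a context manager. A minimal sketch of that dual-use pattern (illustrative only, not MindsDB's actual MDBLock implementation):

import functools

class DualUseLock:
    def __enter__(self):
        # acquire the underlying lock here
        return self

    def __exit__(self, exc_type, exc, tb):
        # release the underlying lock here; returning False propagates
        # exceptions, which is what the pytest.raises checks rely on
        return False

    def __call__(self, func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with self:
                return func(*args, **kwargs)
        return wrapper
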
Example #5
def _import_model_by_name(model_name):
    with MDBLock('exclusive', 'delete_' + model_name):
        with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, model_name + '_light_model_metadata.pickle'), 'rb') as fp:
            lmd = pickle.load(fp)

        if 'ludwig_data' in lmd and 'ludwig_save_path' in lmd['ludwig_data']:
            lmd['ludwig_data']['ludwig_save_path'] = str(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, os.path.basename(lmd['ludwig_data']['ludwig_save_path'])))

        if 'lightwood_data' in lmd and 'save_path' in lmd['lightwood_data']:
            lmd['lightwood_data']['save_path'] = str(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, os.path.basename(lmd['lightwood_data']['save_path'])))

        with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, model_name + '_light_model_metadata.pickle'), 'wb') as fp:
            pickle.dump(lmd, fp, protocol=pickle.HIGHEST_PROTOCOL)
Example #6
def import_model(model_archive_path, new_name=None):
    """
    Import a mindsdb instance storage from an archive file.

    :param mindsdb_storage_dir: full_path that contains your mindsdb predictor zip file
    """
    previous_models = [
        str(p.name) for p in Path(CONFIG.MINDSDB_STORAGE_PATH).iterdir()
        if p.is_dir()
    ]
    extract_dir = tempfile.mkdtemp(dir=CONFIG.MINDSDB_STORAGE_PATH)
    shutil.unpack_archive(model_archive_path, extract_dir=extract_dir)

    try:
        lmd = load_lmd(os.path.join(extract_dir,
                                    'light_model_metadata.pickle'))
    except Exception:
        shutil.rmtree(extract_dir)
        raise

    if new_name is not None:
        lmd['name'] = new_name
    elif lmd['name'] is None:
        lmd['name'] = os.path.basename(extract_dir)

    if lmd['name'] in previous_models:
        shutil.rmtree(extract_dir)
        raise Exception(f"Model with name '{lmd['name']}' already exists.")

    shutil.move(extract_dir,
                os.path.join(CONFIG.MINDSDB_STORAGE_PATH, lmd['name']))

    with MDBLock('exclusive', 'delete_' + lmd['name']):
        if 'ludwig_data' in lmd and 'ludwig_save_path' in lmd['ludwig_data']:
            lmd['ludwig_data']['ludwig_save_path'] = os.path.join(
                CONFIG.MINDSDB_STORAGE_PATH, lmd['name'], 'ludwig_data')

        if 'lightwood_data' in lmd and 'save_path' in lmd['lightwood_data']:
            lmd['lightwood_data']['save_path'] = os.path.join(
                CONFIG.MINDSDB_STORAGE_PATH, lmd['name'], 'lightwood_data')

        with open(
                os.path.join(CONFIG.MINDSDB_STORAGE_PATH, lmd['name'],
                             'light_model_metadata.pickle'), 'wb') as fp:
            pickle.dump(lmd, fp, protocol=pickle.HIGHEST_PROTOCOL)

    print('Model files loaded')
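
A usage sketch; the archive path and new name are placeholders, and the `F` import path is an assumption:

from mindsdb_native import F

# Unpacks the archive into MINDSDB_STORAGE_PATH under a fresh name
F.import_model('home_rentals.zip', new_name='home_rentals_v2')
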
Example #7
def delete_model(model_name):
    """
    If you want to delete exported model files.

    :param model_name: name of the model
    :return: bool (True/False) True if model was deleted
    """
    with MDBLock('exclusive', 'delete_' + model_name):
        lmd = load_lmd(
            os.path.join(CONFIG.MINDSDB_STORAGE_PATH, model_name,
                         'light_model_metadata.pickle'))

        try:
            os.remove(lmd['lightwood_data']['save_path'])
        except Exception:
            pass

        try:
            shutil.rmtree(lmd['ludwig_data']['ludwig_save_path'])
        except Exception:
            pass

        shutil.rmtree(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, model_name))
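
A usage sketch; the model name is a placeholder and the `F` import path is an assumption:

from mindsdb_native import F

F.delete_model('home_rentals_v2')  # removes the metadata and any backend files
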
Example #8
def delete_model(model_name):
    """
    If you want to delete exported model files.

    :param model_name: name of the model
    :return: bool (True/False) True if model was deleted
    """
    with MDBLock('exclusive', 'delete_' + model_name):
        with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, model_name + '_light_model_metadata.pickle'), 'rb') as fp:
            lmd = pickle.load(fp)

        try:
            os.remove(lmd['lightwood_data']['save_path'])
        except Exception:
            pass

        try:
            shutil.rmtree(lmd['ludwig_data']['ludwig_save_path'])
        except Exception:
            pass

        for file_name in [model_name + '_heavy_model_metadata.pickle',
                        model_name + '_light_model_metadata.pickle']:
            os.remove(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, file_name))
Example #9
def get_model_data(model_name=None, lmd=None):
    if model_name is None and lmd is None:
        raise ValueError('provide either model name or lmd')

    if lmd is None:
        with MDBLock('shared', 'get_data_' + model_name):
            lmd = load_lmd(
                os.path.join(CONFIG.MINDSDB_STORAGE_PATH, model_name,
                             'light_model_metadata.pickle'))

    # ADAPTOR CODE
    amd = {}

    if 'tss' in lmd:
        if lmd['tss']['is_timeseries']:
            amd['timeseries'] = {}
            amd['timeseries']['user_settings'] = lmd['tss']
        else:
            amd['timeseries'] = None

    if 'stats_v2' in lmd:
        amd['data_analysis_v2'] = lmd['stats_v2']

    if lmd['current_phase'] == MODEL_STATUS_TRAINED:
        amd['status'] = 'complete'
    elif lmd['current_phase'] == MODEL_STATUS_ERROR:
        amd['status'] = 'error'
    else:
        amd['status'] = 'training'

    # Shared keys
    for k in [
            'name', 'version', 'is_active', 'data_source', 'predict',
            'current_phase', 'train_end_at', 'updated_at', 'created_at',
            'data_preparation', 'validation_set_accuracy'
    ]:
        if k == 'predict':
            amd[k] = lmd['predict_columns']
        elif k in lmd:
            amd[k] = lmd[k]
            if k == 'validation_set_accuracy':
                if lmd['validation_set_accuracy'] is not None:
                    amd['accuracy'] = round(lmd['validation_set_accuracy'], 3)
                else:
                    amd['accuracy'] = None
        else:
            amd[k] = None

    amd['data_analysis'] = {
        'target_columns_metadata': [],
        'input_columns_metadata': []
    }

    amd['model_analysis'] = []
    # Initialize once, outside the loop; resetting it per column would wipe
    # the force vectors collected for earlier predict columns
    amd['force_vectors'] = {}

    for col in lmd['model_columns_map'].keys():
        if col in lmd['columns_to_ignore']:
            continue

        try:
            icm = _adapt_column(lmd['stats_v2'][col], col)
        except Exception as e:
            print(e)
            icm = {'column_name': col}

        if col in lmd['predict_columns']:
            # Histograms for plotting the force vectors
            if 'all_columns_prediction_distribution' in lmd and lmd['all_columns_prediction_distribution'] is not None:
                amd['force_vectors'][col] = {}

                normal_dist = lmd['all_columns_prediction_distribution'][col]
                normal_dist['type'] = 'categorical'
                amd['force_vectors'][col]['normal_data_distribution'] = normal_dist

                amd['force_vectors'][col]['missing_data_distribution'] = {}
                for missing_column in lmd['columnless_prediction_distribution'][col]:
                    dist = lmd['columnless_prediction_distribution'][col][missing_column]
                    dist['type'] = 'categorical'
                    amd['force_vectors'][col]['missing_data_distribution'][missing_column] = dist
                icm['importance_score'] = None
            amd['data_analysis']['target_columns_metadata'].append(icm)

            if 'confusion_matrices' in lmd and col in lmd['confusion_matrices']:
                confusion_matrix = lmd['confusion_matrices'][col]
            else:
                confusion_matrix = None

            if 'accuracy_samples' in lmd and col in lmd['accuracy_samples']:
                accuracy_samples = lmd['accuracy_samples'][col]
            else:
                accuracy_samples = None

            # Model analysis building for each of the predict columns
            mao = {
                'column_name': col,
                'overall_input_importance': {
                    "type": "categorical",
                    "x": [],
                    "y": []
                },
                "train_accuracy_over_time": {
                    "type": "categorical",
                    "x": [],
                    "y": []
                },
                "test_accuracy_over_time": {
                    "type": "categorical",
                    "x": [],
                    "y": []
                },
                "accuracy_histogram": {
                    "x": [],
                    "y": [],
                    'x_explained': []
                },
                "confusion_matrix": confusion_matrix,
                "accuracy_samples": accuracy_samples
            }

            # This is a check to see if model analysis has run on this data
            if ('model_accuracy' in lmd and lmd['model_accuracy'] is not None
                    and 'train' in lmd['model_accuracy']
                    and 'combined' in lmd['model_accuracy']['train']
                    and lmd['model_accuracy']['train']['combined'] is not None):
                train_acc = lmd['model_accuracy']['train']['combined']
                test_acc = lmd['model_accuracy']['test']['combined']

                for i in range(0, len(train_acc)):
                    mao['train_accuracy_over_time']['x'].append(i)
                    mao['train_accuracy_over_time']['y'].append(train_acc[i])

                for i in range(0, len(test_acc)):
                    mao['test_accuracy_over_time']['x'].append(i)
                    mao['test_accuracy_over_time']['y'].append(test_acc[i])

            if ('model_accuracy' in lmd and lmd['model_accuracy'] is not None
                    and lmd['column_importances'] is not None):
                mao['accuracy_histogram']['x'] = [
                    f'{x}' for x in lmd['accuracy_histogram'][col]['buckets']
                ]
                mao['accuracy_histogram']['y'] = lmd['accuracy_histogram'][col]['accuracies']

                if lmd['columns_buckets_importances'] is not None and col in lmd['columns_buckets_importances']:
                    for output_col_bucket in lmd['columns_buckets_importances'][col]:
                        x_explained_member = []
                        for input_col in lmd['columns_buckets_importances'][col][output_col_bucket]:
                            stats = lmd['columns_buckets_importances'][col][output_col_bucket][input_col]
                            x_explained_member.append(_adapt_column(stats, input_col))
                        mao['accuracy_histogram']['x_explained'].append(x_explained_member)

                for icol in lmd['model_columns_map'].keys():
                    if icol in lmd['columns_to_ignore']:
                        continue
                    if icol not in lmd['predict_columns']:
                        try:
                            mao['overall_input_importance']['x'].append(icol)
                            mao['overall_input_importance']['y'].append(
                                round(lmd['column_importances'][icol], 1))
                        except Exception:
                            print(f'No column importances found for {icol} !')

            for key in [
                    'train_data_accuracy', 'test_data_accuracy',
                    'valid_data_accuracy'
            ]:
                if key in lmd:
                    mao[key] = lmd[key]

            amd['model_analysis'].append(mao)
        else:
            if 'column_importances' in lmd and lmd['column_importances'] is not None:
                if not lmd['tss']['is_timeseries'] or col not in lmd['tss']['order_by']:
                    icm['importance_score'] = lmd['column_importances'][col]
            amd['data_analysis']['input_columns_metadata'].append(icm)

    return amd
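
A sketch of reading the adapted metadata dict this returns; the key names follow the adaptor code above, the model name is a placeholder:

amd = get_model_data(model_name='home_rentals')

print(amd['status'])    # 'complete', 'training', or 'error'
print(amd['accuracy'])  # rounded validation accuracy, or None
for mao in amd['model_analysis']:
    print(mao['column_name'], mao['confusion_matrix'] is not None)
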
Example #10
def rename_model(old_model_name, new_model_name):
    """
    Rename an exported model.

    :param old_model_name: this is the name of the model you wish to rename
    :param new_model_name: this is the new name of the model
    :return: bool (True/False) True if predictor was renamed successfully
    """
    lock1 = MDBLock('exclusive', 'delete_' + new_model_name)
    lock2 = MDBLock('exclusive', 'delete_' + old_model_name)
    with lock1, lock2:

        if old_model_name == new_model_name:
            return True

        moved_a_backend = False
        os.makedirs(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, new_model_name),
                    exist_ok=True)
        for extension in ['lightwood_data', 'ludwig_data']:
            # A model usually has files for only one backend, so tolerate a
            # missing source directory and track whether anything moved
            try:
                shutil.move(
                    os.path.join(CONFIG.MINDSDB_STORAGE_PATH, old_model_name,
                                 extension),
                    os.path.join(CONFIG.MINDSDB_STORAGE_PATH, new_model_name,
                                 extension))
                moved_a_backend = True
            except Exception:
                pass

        if not moved_a_backend:
            return False

        lmd = load_lmd(
            os.path.join(CONFIG.MINDSDB_STORAGE_PATH, old_model_name,
                         'light_model_metadata.pickle'))
        hmd = load_lmd(
            os.path.join(CONFIG.MINDSDB_STORAGE_PATH, old_model_name,
                         'heavy_model_metadata.pickle'))

        lmd['name'] = new_model_name
        hmd['name'] = new_model_name

        renamed_one_backend = False
        try:
            lmd['ludwig_data']['ludwig_save_path'] = lmd['ludwig_data'][
                'ludwig_save_path'].replace(old_model_name, new_model_name)
            renamed_one_backend = True
        except Exception:
            pass

        try:
            lmd['lightwood_data']['save_path'] = lmd['lightwood_data'][
                'save_path'].replace(old_model_name, new_model_name)
            renamed_one_backend = True
        except Exception:
            pass

        if not renamed_one_backend:
            return False

        with open(
                os.path.join(CONFIG.MINDSDB_STORAGE_PATH, new_model_name,
                             'light_model_metadata.pickle'), 'wb') as fp:
            pickle.dump(lmd, fp, protocol=pickle.HIGHEST_PROTOCOL)

        with open(
                os.path.join(CONFIG.MINDSDB_STORAGE_PATH, new_model_name,
                             'heavy_model_metadata.pickle'), 'wb') as fp:
            pickle.dump(hmd, fp, protocol=pickle.HIGHEST_PROTOCOL)

        os.remove(
            os.path.join(CONFIG.MINDSDB_STORAGE_PATH, old_model_name,
                         'light_model_metadata.pickle'))
        os.remove(
            os.path.join(CONFIG.MINDSDB_STORAGE_PATH, old_model_name,
                         'heavy_model_metadata.pickle'))
        return True
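
A usage sketch; model names are placeholders and the `F` import path is an assumption:

from mindsdb_native import F

if F.rename_model('home_rentals', 'home_rentals_v2'):
    print('renamed')
else:
    print('rename failed: no backend files were found to move')
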
Example #11
    def learn(self,
              to_predict,
              from_data,
              timeseries_settings=None,
              ignore_columns=None,
              stop_training_in_x_seconds=None,
              backend='lightwood',
              rebuild_model=True,
              use_gpu=None,
              equal_accuracy_for_all_output_categories=True,
              output_categories_importance_dictionary=None,
              advanced_args=None,
              sample_settings=None):
        """
        Learn to predict a column or columns from the data in 'from_data'

        Mandatory arguments:
        :param to_predict: what column or columns you want to predict
        :param from_data: the data that you want to learn from; this can be a file, a pandas data frame, a url, or a mindsdb data source

        Optional Time series arguments:
        :param timeseries_settings: dictionary of options for handling the data as a timeseries

        Optional data transformation arguments:
        :param ignore_columns: column or columns that mindsdb will ignore

        Optional sampling parameters:
        :param sample_settings: dictionary of options for sampling from dataset.
            Includes `sample_for_analysis`, `sample_for_training`, `sample_margin_of_error`, `sample_confidence_level`, `sample_percentage`, `sample_function`.
            Default values depend on the size of input dataset and available memory.
            Generally, the bigger the dataset, the more sampling is used.

        Optional debug arguments:
        :param stop_training_in_x_seconds: (default None) if set, training will be told to finish within the given number of seconds

        :return:
        """

        with MDBLock('exclusive', 'learn_' + self.name):
            ignore_columns = [] if ignore_columns is None else ignore_columns
            timeseries_settings = {} if timeseries_settings is None else timeseries_settings
            advanced_args = {} if advanced_args is None else advanced_args

            predict_columns = to_predict if isinstance(to_predict, list) else [to_predict]
            ignore_columns = ignore_columns if isinstance(ignore_columns, list) else [ignore_columns]
            if len(predict_columns) == 0:
                error = 'You need to specify the column[s] you want to predict via the `to_predict` argument!'
                self.log.error(error)
                raise ValueError(error)

            from_ds = getDS(from_data)

            # Set user-provided subtypes
            from_ds.set_subtypes(advanced_args.get('subtypes', {}))

            transaction_type = TRANSACTION_LEARN

            sample_for_analysis, sample_for_training, disable_lightwood_transform_cache = _get_memory_optimizations(
                from_ds.df)
            sample_settings, sample_function = _prepare_sample_settings(
                sample_settings, sample_for_analysis, sample_for_training)

            timeseries_settings = _prepare_timeseries_settings(
                timeseries_settings)

            self.log.warning(f'Sample for analysis: {sample_for_analysis}')
            self.log.warning(f'Sample for training: {sample_for_training}')
            """
            We don't implement "name" as a concept in mindsdbd data sources, this is only available for files,
            the server doesn't handle non-file data sources at the moment, so this shouldn't prove an issue,
            once we want to support datasources such as s3 and databases for the server we need to add name as a concept (or, preferably, before that)
            """
            data_source_name = from_ds.name()

            heavy_transaction_metadata = dict(name=self.name,
                                              from_data=from_ds,
                                              predictions=None,
                                              model_backend=backend,
                                              sample_function=sample_function,
                                              from_data_type=type(from_ds))

            light_transaction_metadata = dict(
                version=str(__version__),
                name=self.name,
                data_preparation={},
                predict_columns=predict_columns,
                model_columns_map=from_ds._col_map,
                tss=timeseries_settings,
                data_source=data_source_name,
                type=transaction_type,
                sample_settings=sample_settings,
                stop_training_in_x_seconds=stop_training_in_x_seconds,
                rebuild_model=rebuild_model,
                model_accuracy={
                    'train': {},
                    'test': {}
                },
                column_importances=None,
                columns_buckets_importances=None,
                columnless_prediction_distribution=None,
                all_columns_prediction_distribution=None,
                use_gpu=use_gpu,
                columns_to_ignore=ignore_columns,
                validation_set_accuracy=None,
                lightwood_data={},
                ludwig_data={},
                weight_map={},
                confusion_matrices={},
                empty_columns=[],
                data_types={},
                data_subtypes={},
                equal_accuracy_for_all_output_categories=equal_accuracy_for_all_output_categories,
                output_categories_importance_dictionary=(
                    output_categories_importance_dictionary
                    if output_categories_importance_dictionary is not None
                    else {}),
                force_disable_cache=advanced_args.get(
                    'force_disable_cache', disable_lightwood_transform_cache),
                force_categorical_encoding=advanced_args.get(
                    'force_categorical_encoding', []),
                force_column_usage=advanced_args.get('force_column_usage', []),
                use_selfaware_model=advanced_args.get('use_selfaware_model',
                                                      True),
                deduplicate_data=advanced_args.get('deduplicate_data', True),
                null_values=advanced_args.get('null_values', {}),
                data_split_indexes=advanced_args.get('data_split_indexes',
                                                     None),
                tags_delimiter=advanced_args.get('tags_delimiter', ','),
                force_predict=advanced_args.get('force_predict', False),
                mixer_class=advanced_args.get('use_mixers', None),
                breakpoint=self.breakpoint,
                setup_args=from_data.setup_args if hasattr(
                    from_data, 'setup_args') else None)

            if rebuild_model is False:
                old_lmd = dict(light_transaction_metadata)
                old_hmd = dict(heavy_transaction_metadata)

                light_transaction_metadata = load_lmd(
                    os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                 light_transaction_metadata['name'],
                                 'light_model_metadata.pickle'))

                heavy_transaction_metadata = load_hmd(
                    os.path.join(CONFIG.MINDSDB_STORAGE_PATH,
                                 heavy_transaction_metadata['name'],
                                 'heavy_model_metadata.pickle'))

                for k in [
                        'data_preparation', 'rebuild_model', 'data_source',
                        'type', 'columns_to_ignore', 'sample_margin_of_error',
                        'sample_confidence_level', 'stop_training_in_x_seconds'
                ]:
                    # .get(): some of these keys are absent from the freshly
                    # built metadata (e.g. sample_margin_of_error)
                    if old_lmd.get(k) is not None:
                        light_transaction_metadata[k] = old_lmd[k]

                if old_hmd['from_data'] is not None:
                    heavy_transaction_metadata['from_data'] = old_hmd[
                        'from_data']

            self.transaction = LearnTransaction(
                session=self,
                light_transaction_metadata=light_transaction_metadata,
                heavy_transaction_metadata=heavy_transaction_metadata,
                logger=self.log)
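
A usage sketch of this signature; the model name, column names, file path, and the timeseries_settings keys shown are placeholders or assumptions:

from mindsdb_native import Predictor

Predictor(name='home_rentals').learn(
    to_predict='rental_price',
    from_data='home_rentals.csv',
    stop_training_in_x_seconds=600,              # optional training budget
    timeseries_settings={'order_by': ['date']},  # keys are assumed, not shown above
    advanced_args={'force_disable_cache': True}
)
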
Example #12
    def learn(self,
              to_predict,
              from_data,
              group_by=None,    # @TODO: Move this logic to datasource
              window_size=None, # @TODO: Move this logic to datasource
              order_by=None,    # @TODO: Move this logic to datasource
              ignore_columns=None,
              stop_training_in_x_seconds=None,
              backend='lightwood',
              rebuild_model=True,
              use_gpu=None,
              equal_accuracy_for_all_output_categories=True,
              output_categories_importance_dictionary=None,
              advanced_args=None,
              sample_settings=None):
        """
        Learn to predict a column or columns from the data in 'from_data'

        Mandatory arguments:
        :param to_predict: what column or columns you want to predict
        :param from_data: the data that you want to learn from; this can be a file, a pandas data frame, a url, or a mindsdb data source

        Optional Time series arguments:
        :param order_by: the column or columns that define the time series; it can be a list. By default each column is sorted in ascending order; if you want to change this, pass a tuple ('column_name', 'boolean_for_ascending <default=true>')
        :param group_by: tells the time series to learn by grouping rows by a given id
        :param window_size: The number of samples to learn from in the time series

        Optional data transformation arguments:
        :param ignore_columns: column or columns that mindsdb will ignore

        Optional sampling parameters:
        :param sample_settings: dictionary of options for sampling from dataset.
            Includes `sample_for_analysis`, `sample_for_training`, `sample_margin_of_error`, `sample_confidence_level`, `sample_percentage`, `sample_function`.
            Default values depend on the size of input dataset and available memory.
            Generally, the bigger the dataset, the more sampling is used.

        Optional debug arguments:
        :param stop_training_in_x_seconds: (default None) if set, training will be told to finish within the given number of seconds

        :return:
        """
        with MDBLock('exclusive', 'learn_' + self.name):

            if ignore_columns is None:
                ignore_columns = []

            if group_by is None:
                group_by = []

            if order_by is None:
                order_by = []

            # let's turn predict, ignore, group_by and order_by into lists
            predict_columns = to_predict if isinstance(to_predict, list) else [to_predict]
            ignore_columns = ignore_columns if isinstance(ignore_columns, list) else [ignore_columns]
            group_by = group_by if isinstance(group_by, list) else [group_by]
            order_by = order_by if isinstance(order_by, list) else [order_by]

            # let's turn order_by into a list of tuples if it isn't already;
            # each element is ('column_name', 'boolean_for_ascending <default=true>')
            order_by = [col_name if isinstance(col_name, tuple) else (col_name, True) for col_name in order_by]

            if advanced_args is None:
                advanced_args = {}

            from_ds = getDS(from_data)
            
            # Set user-provided subtypes
            from_ds.set_subtypes(
                advanced_args.get('subtypes', {})
            )

            transaction_type = TRANSACTION_LEARN

            sample_for_analysis, sample_for_training, disable_lightwood_transform_cache = _get_memory_optimizations(
                from_ds.df)
            sample_settings, sample_function = _prepare_sample_settings(sample_settings,
                                                        sample_for_analysis,
                                                        sample_for_training)

            self.log.warning(f'Sample for analysis: {sample_for_analysis}')
            self.log.warning(f'Sample for training: {sample_for_training}')

            if len(predict_columns) == 0:
                error = 'You need to specify a column to predict'
                self.log.error(error)
                raise ValueError(error)

            is_time_series = len(order_by) > 0

            """
            We don't implement "name" as a concept in mindsdbd data sources, this is only available for files,
            the server doesn't handle non-file data sources at the moment, so this shouldn't prove an issue,
            once we want to support datasources such as s3 and databases for the server we need to add name as a concept (or, preferably, before that)
            """
            data_source_name = from_ds.name()

            heavy_transaction_metadata = dict(
                name=self.name,
                from_data=from_ds,
                predictions=None,
                model_backend=backend,
                sample_function=sample_function
            )

            light_transaction_metadata = dict(
                version = str(__version__),
                name = self.name,
                data_preparation = {},
                predict_columns = predict_columns,
                model_columns_map = from_ds._col_map,
                model_group_by = group_by,
                model_order_by = order_by,
                model_is_time_series = is_time_series,
                data_source = data_source_name,
                type = transaction_type,
                window_size = window_size,
                sample_settings = sample_settings,
                stop_training_in_x_seconds = stop_training_in_x_seconds,
                rebuild_model = rebuild_model,
                model_accuracy = {'train': {}, 'test': {}},
                column_importances = None,
                columns_buckets_importances = None,
                columnless_prediction_distribution = None,
                all_columns_prediction_distribution = None,
                use_gpu = use_gpu,
                columns_to_ignore = ignore_columns,
                validation_set_accuracy = None,
                lightwood_data = {},
                ludwig_data = {},
                weight_map = {},
                confusion_matrices = {},
                empty_columns = [],
                data_types = {},
                data_subtypes = {},
                equal_accuracy_for_all_output_categories = equal_accuracy_for_all_output_categories,
                output_categories_importance_dictionary = output_categories_importance_dictionary if output_categories_importance_dictionary is not None else {},

                force_disable_cache = advanced_args.get('force_disable_cache', disable_lightwood_transform_cache),
                force_categorical_encoding = advanced_args.get('force_categorical_encoding', []),
                handle_foreign_keys = advanced_args.get('handle_foreign_keys', True),
                use_selfaware_model = advanced_args.get('use_selfaware_model', True),
                deduplicate_data = advanced_args.get('deduplicate_data', True),
                null_values = advanced_args.get('null_values', {}),
                data_split_indexes = advanced_args.get('data_split_indexes', None),
                tags_delimiter = advanced_args.get('tags_delimiter', ','),
                force_predict = advanced_args.get('force_predict', False),

                breakpoint=self.breakpoint
            )

            if rebuild_model is False:
                old_lmd = dict(light_transaction_metadata)
                old_hmd = dict(heavy_transaction_metadata)

                with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, light_transaction_metadata['name'] + '_light_model_metadata.pickle'), 'rb') as fp:
                    light_transaction_metadata = pickle.load(fp)

                with open(os.path.join(CONFIG.MINDSDB_STORAGE_PATH, heavy_transaction_metadata['name'] + '_heavy_model_metadata.pickle'), 'rb') as fp:
                    heavy_transaction_metadata = pickle.load(fp)

                for k in ['data_preparation', 'rebuild_model', 'data_source', 'type',
                          'columns_to_ignore', 'sample_margin_of_error',
                          'sample_confidence_level', 'stop_training_in_x_seconds']:
                    # .get(): some of these keys are absent from the freshly built metadata
                    if old_lmd.get(k) is not None:
                        light_transaction_metadata[k] = old_lmd[k]

                if old_hmd['from_data'] is not None:
                    heavy_transaction_metadata['from_data'] = old_hmd['from_data']

            self.transaction = LearnTransaction(session=self,
                        light_transaction_metadata=light_transaction_metadata,
                        heavy_transaction_metadata=heavy_transaction_metadata,
                        logger=self.log)
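
A usage sketch of this older signature, where the time-series options are top-level arguments rather than a timeseries_settings dict; the data source and column names are placeholders:

from mindsdb_native import Predictor

Predictor(name='fuel_usage').learn(
    to_predict='fuel_consumed',
    from_data='fleet.csv',
    order_by=[('trip_date', True)],  # ascending by trip_date
    group_by='vehicle_id',
    window_size=10
)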