def test_explain_prediction(self):
        mdb = Predictor(name='test_explain_prediction')

        n_points = 100
        input_dataframe = pd.DataFrame(
            {
                'numeric_x': list(range(n_points)),
                'categorical_x': [int(x % 2 == 0) for x in range(n_points)],
            },
            index=list(range(n_points)))

        input_dataframe[
            'numeric_y'] = input_dataframe.numeric_x + 2 * input_dataframe.categorical_x

        mdb.learn(from_data=input_dataframe,
                  to_predict='numeric_y',
                  stop_training_in_x_seconds=1,
                  use_gpu=False,
                  advanced_args={'force_predict': True})

        # Test predicting using a data frame
        result = mdb.predict(when_data=pd.DataFrame([{
            "numeric_x": 10,
            'categorical_x': 1
        }]))
        explanation_new = result[0].explanation['numeric_y']
        assert isinstance(explanation_new['predicted_value'], int)
        assert isinstance(explanation_new['confidence_interval'], list)
        assert isinstance(explanation_new['confidence_interval'][0], float)
        assert isinstance(explanation_new['important_missing_information'],
                          list)
        assert isinstance(explanation_new['prediction_quality'], str)

        assert len(str(result[0])) > 20
Beispiel #2
0
    def test_predictor_deduplicate_data(self):
        n_points = 100
        input_dataframe = pd.DataFrame({
            'numeric_int': [x % 44 for x in list(range(n_points))],
            'numeric_int_2': [x % 20 for x in list(range(n_points))],
        }, index=list(range(n_points)))
        input_dataframe['y'] = input_dataframe['numeric_int'] % 10

        # Add duplicate row
        input_dataframe = input_dataframe.append(input_dataframe.iloc[99], ignore_index=True)

        mdb = Predictor(name='test_drop_duplicates')
        mdb.learn(
            from_data=input_dataframe,
            to_predict='y',
            stop_training_in_x_seconds=1,
            use_gpu=False
        )

        model_data = F.get_model_data('test_drop_duplicates')

        # Ensure duplicate row was not used for training, or analysis

        assert model_data['data_preparation']['total_row_count'] == n_points
        assert model_data['data_preparation']['used_row_count'] <= n_points

        assert sum([model_data['data_preparation']['train_row_count'],
                   model_data['data_preparation']['validation_row_count'],
                   model_data['data_preparation']['test_row_count']]) == n_points

        assert sum([mdb.transaction.input_data.train_df.shape[0],
                    mdb.transaction.input_data.test_df.shape[0],
                    mdb.transaction.input_data.validation_df.shape[0]]) == n_points

        # Disable deduplication and ensure the duplicate row is used
        mdb = Predictor(name='test_drop_duplicates')
        mdb.learn(
            from_data=input_dataframe,
            to_predict='y',
            stop_training_in_x_seconds=1,
            use_gpu=False,
            advanced_args={
                'deduplicate_data': False
            }
        )

        model_data = F.get_model_data('test_drop_duplicates')

        # Duplicate row was used for analysis and training

        assert model_data['data_preparation']['total_row_count'] == n_points+1
        assert model_data['data_preparation']['used_row_count'] <= n_points+1

        assert sum([model_data['data_preparation']['train_row_count'],
                    model_data['data_preparation']['validation_row_count'],
                    model_data['data_preparation']['test_row_count']]) == n_points+1

        assert sum([mdb.transaction.input_data.train_df.shape[0],
                    mdb.transaction.input_data.test_df.shape[0],
                    mdb.transaction.input_data.validation_df.shape[0]]) == n_points+1
    def test_sample_for_training(self):
        predictor = Predictor(name='test')

        n_points = 100
        input_dataframe = pd.DataFrame(
            {
                'numeric_int': [x % 10 for x in list(range(n_points))],
                'categorical_binary': [0, 1] * (n_points // 2),
            },
            index=list(range(n_points)))
        input_dataframe[
            'y'] = input_dataframe.numeric_int + input_dataframe.numeric_int * input_dataframe.categorical_binary

        mock_function = PickableMock(spec=sample_data, wraps=sample_data)
        setattr(mock_function, '__name__', 'mock_sample_data')
        with mock.patch(
                'mindsdb_native.libs.controllers.predictor.sample_data',
                mock_function):

            predictor.learn(from_data=input_dataframe,
                            to_predict='y',
                            backend='lightwood',
                            sample_settings={
                                'sample_for_training': True,
                                'sample_for_analysis': True
                            },
                            stop_training_in_x_seconds=1,
                            use_gpu=False)

            assert mock_function.called

            # 1 call when sampling for analysis
            # 1 call when sampling training data for lightwood
            # 1 call when sampling testing data for lightwood
            assert mock_function.call_count == 3
Beispiel #4
0
    def test_ignore_foreign_keys(self):
        input_dataframe = pd.DataFrame({
            'do_use': list(range(100)),
            'numeric_id': list(range(100)),
            'y': list(range(100)),
        })

        predictor = Predictor(name='test')
        predictor.learn(from_data=input_dataframe,
                        to_predict='y',
                        stop_training_in_x_seconds=1,
                        use_gpu=False)

        transaction = predictor.transaction

        assert 'do_use' in transaction.input_data.train_df.columns
        # Foreign key is ignored and removed from data frames
        assert 'numeric_id' not in transaction.input_data.train_df.columns
        assert 'numeric_id' in transaction.lmd['columns_to_ignore']

        predictor = Predictor(name='test')
        predictor.learn(from_data=input_dataframe,
                        to_predict='y',
                        stop_training_in_x_seconds=1,
                        advanced_args={'handle_foreign_keys': False},
                        use_gpu=False)

        transaction = predictor.transaction

        assert 'do_use' in transaction.input_data.train_df.columns
        assert 'numeric_id' in transaction.input_data.train_df.columns
        assert 'numeric_id' not in transaction.lmd['columns_to_ignore']
    def test_category_tags_output(self):
        vocab = random.sample(SMALL_VOCAB, 10)
        vocab = {i: word for i, word in enumerate(vocab)}
        # x1 contains the index of first tag present
        # x2 contains the index of second tag present
        # if a tag is missing then x1/x2 contain -1 instead
        # Thus the dataset should be perfectly predicted
        n_points = 5000
        x1 = [
            random.randint(0,
                           len(vocab) - 1) if random.random() > 0.1 else -1
            for i in range(n_points)
        ]
        x2 = [
            random.randint(0,
                           len(vocab) - 1) if random.random() > 0.1 else -1
            for i in range(n_points)
        ]
        tags = []
        for x1_index, x2_index in zip(x1, x2):
            row_tags = set([vocab.get(x1_index), vocab.get(x2_index)])
            row_tags = [x for x in row_tags if x is not None]
            tags.append(','.join(row_tags))

        df = pd.DataFrame({'x1': x1, 'x2': x2, 'tags': tags})

        df_train = df.iloc[:round(n_points * 0.9)]
        df_test = df.iloc[round(n_points * 0.9):]

        predictor = Predictor('test')

        predictor.learn(from_data=df_train,
                        to_predict='tags',
                        advanced_args=dict(deduplicate_data=False),
                        stop_training_in_x_seconds=60,
                        use_gpu=False)

        model_data = F.get_model_data('test')
        assert model_data['data_analysis_v2']['tags']['typing'][
            'data_type'] == DATA_TYPES.CATEGORICAL
        assert model_data['data_analysis_v2']['tags']['typing'][
            'data_subtype'] == DATA_SUBTYPES.TAGS

        predictions = predictor.predict(when_data=df_test)
        test_tags = df_test.tags.apply(lambda x: x.split(','))

        predicted_tags = []
        for i in range(len(predictions)):
            predicted_tags.append(predictions[i]['tags'])

        test_tags_encoded = predictor.transaction.model_backend.predictor._mixer.encoders[
            'tags'].encode(test_tags)
        pred_labels_encoded = predictor.transaction.model_backend.predictor._mixer.encoders[
            'tags'].encode(predicted_tags)
        score = f1_score(test_tags_encoded,
                         pred_labels_encoded,
                         average='weighted')

        assert score >= 0.3
    def test_timeseries(self, tmp_path):
        ts_hours = 12
        data_len = 120
        train_file_name = os.path.join(str(tmp_path), 'train_data.csv')
        test_file_name = os.path.join(str(tmp_path), 'test_data.csv')

        features = generate_value_cols(['date', 'int'], data_len,
                                       ts_hours * 3600)
        labels = [generate_timeseries_labels(features)]

        feature_headers = list(map(lambda col: col[0], features))
        label_headers = list(map(lambda col: col[0], labels))

        # Create the training dataset and save it to a file
        columns_train = list(
            map(lambda col: col[1:int(len(col) * 3 / 4)], features))
        columns_train.extend(
            list(map(lambda col: col[1:int(len(col) * 3 / 4)], labels)))
        columns_to_file(columns_train,
                        train_file_name,
                        headers=[*feature_headers, *label_headers])
        # Create the testing dataset and save it to a file
        columns_test = list(
            map(lambda col: col[int(len(col) * 3 / 4):], features))
        columns_to_file(columns_test, test_file_name, headers=feature_headers)

        mdb = Predictor(name='test_timeseries')

        mdb.learn(from_data=train_file_name,
                  to_predict=label_headers,
                  timeseries_settings={
                      'order_by': [feature_headers[0]],
                      'window': 3
                  },
                  stop_training_in_x_seconds=10,
                  use_gpu=False,
                  advanced_args={'force_predict': True})

        results = mdb.predict(when_data=test_file_name, use_gpu=False)

        for row in results:
            expect_columns = [
                label_headers[0], label_headers[0] + '_confidence'
            ]
            for col in expect_columns:
                assert col in row

        models = F.get_models()
        model_data = F.get_model_data(models[0]['name'])
        assert model_data
Beispiel #7
0
def validate(to_predict, from_data, accuracy_score_functions, learn_args=None, test_args=None):
            if learn_args is None: learn_args = {}
            if test_args is None: test_args = {}
            
            name = str(uuid.uuid4()).replace('-','')
            predictor = Predictor(name)
            
            predictor.learn(to_predict, from_data, **learn_args)
            validation_data = predictor.transaction.input_data.validation_df

            accuracy = predictor.test(when_data=validation_data, accuracy_score_functions=accuracy_score_functions, **test_args)

            delete_model(name) 

            return accuracy
    def test_category_tags_input(self):
        vocab = random.sample(SMALL_VOCAB, 10)
        # tags contains up to 2 randomly selected tags
        # y contains the sum of indices of tags
        # the dataset should be nearly perfectly predicted
        n_points = 5000
        tags = []
        y = []
        for i in range(n_points):
            row_tags = []
            row_y = 0
            for k in range(2):
                if random.random() > 0.2:
                    selected_index = random.randint(0, len(vocab) - 1)
                    if vocab[selected_index] not in row_tags:
                        row_tags.append(vocab[selected_index])
                        row_y += selected_index
            tags.append(','.join(row_tags))
            y.append(row_y)

        df = pd.DataFrame({'tags': tags, 'y': y})

        df_train = df.iloc[:round(n_points * 0.9)]
        df_test = df.iloc[round(n_points * 0.9):]

        predictor = Predictor(name='test')

        predictor.learn(from_data=df_train,
                        to_predict='y',
                        advanced_args=dict(deduplicate_data=False),
                        stop_training_in_x_seconds=40,
                        use_gpu=False)

        model_data = F.get_model_data('test')
        assert model_data['data_analysis_v2']['tags']['typing'][
            'data_type'] == DATA_TYPES.CATEGORICAL
        assert model_data['data_analysis_v2']['tags']['typing'][
            'data_subtype'] == DATA_SUBTYPES.TAGS

        predictions = predictor.predict(when_data=df_test)
        test_y = df_test.y.apply(str)

        predicted_y = []
        for i in range(len(predictions)):
            predicted_y.append(predictions[i]['y'])

        score = accuracy_score(test_y, predicted_y)
        assert score >= 0.2
Beispiel #9
0
    def test_multilabel_prediction(self, tmp_path):
        train_file_name = os.path.join(str(tmp_path), 'train_data.csv')
        test_file_name = os.path.join(str(tmp_path), 'test_data.csv')
        data_len = 60

        features = generate_value_cols(['int', 'float', 'int', 'float'], data_len)
        labels = []
        labels.append(generate_log_labels(features))
        labels.append(generate_timeseries_labels(features))

        feature_headers = list(map(lambda col: col[0], features))
        label_headers = list(map(lambda col: col[0], labels))

        # Create the training dataset and save it to a file
        columns_train = list(
            map(lambda col: col[1:int(len(col) * 3 / 4)], features))
        columns_train.extend(
            list(map(lambda col: col[1:int(len(col) * 3 / 4)], labels)))
        columns_to_file(columns_train, train_file_name,
                        headers=[*feature_headers, *label_headers])

        # Create the testing dataset and save it to a file
        columns_test = list(
            map(lambda col: col[int(len(col) * 3 / 4):], features))
        columns_to_file(columns_test, test_file_name,
                        headers=feature_headers)

        mdb = Predictor(name='test_multilabel_prediction')
        mdb.learn(
            from_data=train_file_name,
            to_predict=label_headers,
            stop_training_in_x_seconds=1,
            use_gpu=False,
            advanced_args={'force_predict': True}
        )

        results = mdb.predict(when_data=test_file_name)
        models = F.get_models()
        model_data = F.get_model_data(models[0]['name'])
        assert model_data

        for i in range(len(results)):
            row = results[i]
            for label in label_headers:
                expect_columns = [label, label + '_confidence']
                for col in expect_columns:
                    assert col in row
    def test_ignore_columns(self):
        input_dataframe = pd.DataFrame({
            'do_use': list(range(100)),
            'y': list(range(100)),
            'ignore_this': list(range(100, 0, -1))
        })

        predictor = Predictor(name='test')
        predictor.learn(from_data=input_dataframe,
                        to_predict='y',
                        ignore_columns=['ignore_this'],
                        stop_training_in_x_seconds=1,
                        use_gpu=False,
                        advanced_args={'force_column_usage': ['do_use']})
        transaction = predictor.transaction

        assert 'do_use' in transaction.input_data.train_df.columns
        assert 'ignore_this' not in transaction.input_data.train_df.columns
    def test_ignore_identifiers(self):
        input_dataframe = pd.DataFrame({
            'do_use': [*range(60), *range(40)],
            'numeric_id':
            list(range(100)),
            'malicious_naming':
            list(range(99)) + [200],
            'y':
            list(range(100)),
        })

        predictor = Predictor(name='test')
        predictor.learn(from_data=input_dataframe,
                        to_predict='y',
                        stop_training_in_x_seconds=1,
                        use_gpu=False)

        transaction = predictor.transaction

        assert 'do_use' in transaction.input_data.train_df.columns
        # Foreign key is ignored and removed from data frames
        assert 'numeric_id' not in transaction.input_data.train_df.columns
        assert 'numeric_id' in transaction.lmd['columns_to_ignore']
        assert 'malicious_naming' not in transaction.input_data.train_df.columns
        assert 'malicious_naming' in transaction.lmd['columns_to_ignore']

        predictor = Predictor(name='test')
        predictor.learn(from_data=input_dataframe,
                        to_predict='y',
                        stop_training_in_x_seconds=1,
                        advanced_args={'force_column_usage': ['numeric_id']},
                        use_gpu=False)

        transaction = predictor.transaction

        assert 'do_use' in transaction.input_data.train_df.columns
        assert 'numeric_id' in transaction.input_data.train_df.columns
        assert 'numeric_id' not in transaction.lmd['columns_to_ignore']
    def test_user_provided_split_indices(self):
        predictor = Predictor('test')

        df = pd.DataFrame({'col_a': [*range(100)], 'col_b': [*range(100)]})

        predictor.learn(from_data=df,
                        to_predict='col_b',
                        advanced_args={
                            'data_split_indexes': {
                                'validation_indexes': [*range(0, 30)],
                                'train_indexes': [*range(30, 60)],
                                'test_indexes': [*range(60, 100)]
                            },
                            'force_column_usage': ['col_a', 'col_b']
                        },
                        use_gpu=False)

        assert set(predictor.transaction.input_data.train_df['col_a'].tolist()
                   ) == set(range(30, 60))
        assert set(
            predictor.transaction.input_data.test_df['col_a'].tolist()) == set(
                range(60, 100))
        assert set(predictor.transaction.input_data.validation_df['col_a'].
                   tolist()) == set(range(0, 30))
    def test_house_pricing(self, use_gpu):
        """
        Tests whole pipeline from downloading the dataset to making predictions and explanations.
        """
        # Create & Learn
        name = 'home_rentals_price'
        mdb = Predictor(name=name)
        mdb.learn(
            to_predict='rental_price',
            from_data=
            "https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv",
            backend='lightwood',
            stop_training_in_x_seconds=80,
            use_gpu=use_gpu)

        def assert_prediction_interface(predictions):
            for prediction in predictions:
                assert hasattr(prediction, 'explanation')

        test_results = mdb.test(
            when_data=
            "https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv",
            accuracy_score_functions=r2_score,
            predict_args={'use_gpu': use_gpu})
        assert test_results['rental_price_accuracy'] >= 0.8

        predictions = mdb.predict(
            when_data=
            "https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv",
            use_gpu=use_gpu)
        assert_prediction_interface(predictions)
        predictions = mdb.predict(when_data={'sqft': 300}, use_gpu=use_gpu)
        assert_prediction_interface(predictions)

        amd = F.get_model_data(name)
        assert isinstance(json.dumps(amd), str)

        for k in [
                'status', 'name', 'version', 'data_source', 'current_phase',
                'updated_at', 'created_at', 'train_end_at'
        ]:
            assert isinstance(amd[k], str)

        assert isinstance(amd['predict'], (list, str))
        assert isinstance(amd['is_active'], bool)

        for k in ['validation_set_accuracy', 'accuracy']:
            assert isinstance(amd[k], float)

        for k in amd['data_preparation']:
            assert isinstance(amd['data_preparation'][k], (int, float))

        for k in amd['data_analysis']:
            assert (len(amd['data_analysis'][k]) > 0)
            assert isinstance(amd['data_analysis'][k][0], dict)

        model_analysis = amd['model_analysis']
        assert (len(model_analysis) > 0)
        assert isinstance(model_analysis[0], dict)
        input_importance = model_analysis[0]["overall_input_importance"]
        assert (len(input_importance) > 0)
        assert isinstance(input_importance, dict)

        for k in ['train', 'test', 'valid']:
            assert isinstance(model_analysis[0][k + '_data_accuracy'], dict)
            assert len(model_analysis[0][k + '_data_accuracy']) == 1
            assert model_analysis[0][k +
                                     '_data_accuracy']['rental_price'] > 0.4

        for column, importance in zip(input_importance["x"],
                                      input_importance["y"]):
            assert isinstance(column, str)
            assert (len(column) > 0)
            assert isinstance(importance, (float, int))
            assert (importance >= 0 and importance <= 10)

        # Test confidence estimation after save -> load
        p = None
        F.export_predictor(name)
        F.import_model(f"{name}.zip", f"{name}-new")
        p = Predictor(name=f'{name}-new')
        predictions = p.predict(when_data={'sqft': 1000},
                                use_gpu=use_gpu,
                                run_confidence_variation_analysis=True)
        assert_prediction_interface(predictions)
    def test_custom_backend(self):
        predictor = Predictor(name='custom_model_test_predictor')

        class CustomDTModel():
            def __init__(self):
                self.clf = LinearRegression()
                le = preprocessing.LabelEncoder()

            def set_transaction(self, transaction):
                self.transaction = transaction
                self.output_columns = self.transaction.lmd['predict_columns']
                self.input_columns = [
                    x for x in self.transaction.lmd['columns']
                    if x not in self.output_columns
                ]
                self.train_df = self.transaction.input_data.train_df
                self.test_dt = train_df = self.transaction.input_data.test_df

            def train(self):
                self.le_arr = {}
                for col in [*self.output_columns, *self.input_columns]:
                    self.le_arr[col] = preprocessing.LabelEncoder()
                    self.le_arr[col].fit(
                        pd.concat([
                            self.transaction.input_data.train_df,
                            self.transaction.input_data.test_df,
                            self.transaction.input_data.validation_df
                        ])[col])

                X = []
                for col in self.input_columns:
                    X.append(self.le_arr[col].transform(
                        self.transaction.input_data.train_df[col]))

                X = np.swapaxes(X, 1, 0)

                # Only works with one output column
                Y = self.le_arr[self.output_columns[0]].transform(
                    self.transaction.input_data.train_df[
                        self.output_columns[0]])

                self.clf.fit(X, Y)

            def predict(self, mode='predict', ignore_columns=[]):
                if mode == 'predict':
                    df = self.transaction.input_data.data_frame
                if mode == 'validate':
                    df = self.transaction.input_data.validation_df
                elif mode == 'test':
                    df = self.transaction.input_data.test_df

                X = []
                for col in self.input_columns:
                    X.append(self.le_arr[col].transform(df[col]))

                X = np.swapaxes(X, 1, 0)

                predictions = self.clf.predict(X)

                formated_predictions = {self.output_columns[0]: predictions}

                return formated_predictions

        dt_model = CustomDTModel()

        predictor.learn(
            to_predict='rental_price',
            from_data=
            "https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv",
            backend=dt_model,
            use_gpu=False)
        predictions = predictor.predict(
            when_data=
            "https://s3.eu-west-2.amazonaws.com/mindsdb-example-data/home_rentals.csv",
            backend=dt_model)

        assert predictions