Beispiel #1
0
    def test_patsy_deviation_coding(self):
        """Sum (deviation) contrasts requested through a patsy formula.

        ``transform`` is called with ``C(<column>, Sum)`` formulas and each
        returned design matrix is compared with a hand-written float frame.
        """
        labels = ['a', 'b', 'c', 'd', 'e']
        mdf = pdml.ModelFrame({'X': [1, 2, 3, 4, 5],
                               'Y': [1, 3, 2, 2, 1],
                               'Z': [1, 1, 1, 2, 2]},
                              target='Z', index=labels)

        # --- formula on column 'X' ---
        x_columns = ['Intercept', 'C(X, Sum)[S.1]', 'C(X, Sum)[S.2]',
                     'C(X, Sum)[S.3]', 'C(X, Sum)[S.4]']
        coded_x = mdf.transform('C(X, Sum)')
        wanted_x = pd.DataFrame({'Intercept': [1, 1, 1, 1, 1],
                                 'C(X, Sum)[S.1]': [1, 0, 0, 0, -1],
                                 'C(X, Sum)[S.2]': [0, 1, 0, 0, -1],
                                 'C(X, Sum)[S.3]': [0, 0, 1, 0, -1],
                                 'C(X, Sum)[S.4]': [0, 0, 0, 1, -1]},
                                index=labels, columns=x_columns, dtype=float)
        tm.assert_frame_equal(coded_x, wanted_x)

        # --- formula on column 'Y' ---
        y_columns = ['Intercept', 'C(Y, Sum)[S.1]', 'C(Y, Sum)[S.2]']
        coded_y = mdf.transform('C(Y, Sum)')
        wanted_y = pd.DataFrame({'Intercept': [1, 1, 1, 1, 1],
                                 'C(Y, Sum)[S.1]': [1, -1, 0, 0, 1],
                                 'C(Y, Sum)[S.2]': [0, -1, 1, 1, 0]},
                                index=labels, columns=y_columns, dtype=float)
        tm.assert_frame_equal(coded_y, wanted_y)
Beispiel #2
0
    def test_patsy_matrices(self):
        """A two-sided patsy formula returns a ModelFrame targeting the LHS column."""
        labels = ['a', 'b', 'c']
        source = pd.DataFrame({'A': [1, 2, 3],
                               'B': [4, 5, 6],
                               'C': [7, 8, 9]},
                              index=labels, columns=['A', 'B', 'C'])
        original_target = pd.Series([10, 11, 12], index=labels)
        mdf = pdml.ModelFrame(source, target=original_target)

        transformed = mdf.transform('A ~ B + C')
        out_columns = ['A', 'Intercept', 'B', 'C']

        # container type, shape and labels
        self.assertIsInstance(transformed, pdml.ModelFrame)
        self.assertEqual(transformed.shape, (3, 4))
        tm.assert_index_equal(transformed.index, pd.Index(labels))
        tm.assert_index_equal(transformed.columns, pd.Index(out_columns))

        # full content, as floats
        wanted_frame = pd.DataFrame({'A': [1, 2, 3],
                                     'Intercept': [1, 1, 1],
                                     'B': [4, 5, 6],
                                     'C': [7, 8, 9]},
                                    index=labels, columns=out_columns,
                                    dtype=float)
        tm.assert_frame_equal(transformed, wanted_frame)

        # the formula's left-hand side is now the target
        wanted_target = pd.Series([1, 2, 3], index=labels, name='A',
                                  dtype=float)
        tm.assert_series_equal(transformed.target, wanted_target)
        self.assertEqual(transformed.target.name, 'A')
        self.assertEqual(transformed.target_name, 'A')
Beispiel #3
0
    def test_frame_target_object_set(self):
        """Assign ``target`` on a frame whose column labels are datetime objects."""
        labels = ['a', 'b', 'c']
        data = pd.DataFrame({datetime.datetime(2014, 1, 1): [1, 2, 3],
                             datetime.datetime(2015, 1, 1): [4, 5, 6],
                             datetime.datetime(2016, 1, 1): [7, 8, 9]},
                            index=labels)
        mdf = pdml.ModelFrame(data)
        stamps = [datetime.datetime(year, 1, 1) for year in (2014, 2015, 2016)]

        # first assignment: the series name (5) becomes the target column
        mdf.target = pd.Series(['A', 'B', 'C'], index=labels, name=5)
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index([5] + stamps))
        tm.assert_frame_equal(mdf.data, data)
        wanted_target = pd.Series(['A', 'B', 'C'], index=labels, name=5)
        tm.assert_series_equal(mdf.target, wanted_target)
        self.assertEqual(mdf.target.name, 5)

        # name will be ignored if ModelFrame already has a target
        mdf.target = pd.Series([10, 11, 12], index=labels, name='X')
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index([5] + stamps))
        tm.assert_frame_equal(mdf.data, data)
        wanted_target = pd.Series([10, 11, 12], index=labels, name=5)
        tm.assert_series_equal(mdf.target, wanted_target)
        self.assertEqual(mdf.target.name, 5)
Beispiel #4
0
    def test_multioutput(self):
        """Compare a ModelFrame-built MultiOutputRegressor with plain scikit-learn.

        Two identically configured estimators are created: ``reg1`` through
        the ModelFrame accessors and ``reg2`` directly from scikit-learn.
        ``reg1`` is fitted and used for prediction through the ModelFrame,
        ``reg2`` through the raw arrays, and the predictions must agree.
        """
        # http://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py

        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.ensemble import RandomForestRegressor

        # Create a random dataset
        rng = np.random.RandomState(1)
        X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
        y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
        y += (0.5 - rng.rand(*y.shape))

        df = pdml.ModelFrame(X, target=y)

        max_depth = 30

        # estimator obtained through the ModelFrame accessors
        rf1 = df.ensemble.RandomForestRegressor(max_depth=max_depth,
                                                random_state=self.random_state)
        reg1 = df.multioutput.MultiOutputRegressor(rf1)

        # reference estimator built directly from scikit-learn
        rf2 = RandomForestRegressor(max_depth=max_depth,
                                    random_state=self.random_state)
        reg2 = MultiOutputRegressor(rf2)

        df.fit(reg1)
        reg2.fit(X, y)

        # Predict with the estimator that was fitted through the ModelFrame;
        # previously ``reg2`` was used here, leaving ``reg1`` unverified.
        result = df.predict(reg1)
        expected = pd.DataFrame(reg2.predict(X))
        tm.assert_frame_equal(result, expected)
Beispiel #5
0
    def test_frame_init_df_series(self):
        """Build a ModelFrame from a DataFrame plus a named Series target.

        Covers a target named '.target', a target whose index differs from
        the data (must raise ``ValueError``) and a target with a custom name.
        """
        # ``assertRaisesRegexp`` was removed from unittest in Python 3.12;
        # prefer the current spelling and fall back only where it is missing.
        assert_raises_regex = (getattr(self, 'assertRaisesRegex', None) or
                               self.assertRaisesRegexp)

        # initialization by dataframe and a series named '.target'
        df = pd.DataFrame({'A': [1, 2, 3],
                           'B': [4, 5, 6],
                           'C': [7, 8, 9]},
                          index=['a', 'b', 'c'],
                          columns=['A', 'B', 'C'])
        s = pd.Series([1, 2, 3], index=['a', 'b', 'c'], name='.target')

        mdf = pdml.ModelFrame(df, target=s)
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['.target', 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)
        tm.assert_series_equal(mdf.target, s)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        # a target carrying the default integer index does not match the data
        s = pd.Series([1, 2, 3])
        with assert_raises_regex(ValueError, 'data and target must have equal index'):
            pdml.ModelFrame(df, target=s)

        # a custom series name becomes the target column name
        s = pd.Series([1, 2, 3], index=['a', 'b', 'c'], name='XXX')
        mdf = pdml.ModelFrame(df, target=s)
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['XXX', 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)
        tm.assert_series_equal(mdf.target, s)
        self.assertEqual(mdf.target.name, 'XXX')
        self.assertEqual(mdf.target_name, 'XXX')
Beispiel #6
0
    def test_multioutput(self):
        """Compare a ModelFrame-built MultiOutputRegressor with plain scikit-learn.

        Two identically configured estimators are created: ``reg1`` through
        the ModelFrame accessors and ``reg2`` directly from scikit-learn.
        ``reg1`` is fitted and used for prediction through the ModelFrame,
        ``reg2`` through the raw arrays, and the predictions must agree.
        """
        # http://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py

        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.ensemble import RandomForestRegressor

        # Create a random dataset
        rng = np.random.RandomState(1)
        X = np.sort(200 * rng.rand(600, 1) - 100, axis=0)
        y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
        y += (0.5 - rng.rand(*y.shape))

        df = pdml.ModelFrame(X, target=y)

        max_depth = 30

        # estimator obtained through the ModelFrame accessors
        rf1 = df.ensemble.RandomForestRegressor(max_depth=max_depth,
                                                random_state=self.random_state)
        reg1 = df.multioutput.MultiOutputRegressor(rf1)

        # reference estimator built directly from scikit-learn
        rf2 = RandomForestRegressor(max_depth=max_depth,
                                    random_state=self.random_state)
        reg2 = MultiOutputRegressor(rf2)

        df.fit(reg1)
        reg2.fit(X, y)

        # Predict with the estimator that was fitted through the ModelFrame;
        # previously ``reg2`` was used here, leaving ``reg1`` unverified.
        result = df.predict(reg1)
        expected = pd.DataFrame(reg2.predict(X))
        tm.assert_frame_equal(result, expected)
Beispiel #7
0
    def test_frame_data_proparty_series(self):
        """Assigning a Series to ``data`` replaces the data columns; a list raises."""
        labels = ['a', 'b', 'c']

        def two_column_frame():
            # each scenario starts from an identical, freshly built frame
            return pdml.ModelFrame({'A': [1, 2, 3],
                                    'B': [4, 5, 6]},
                                   target=[7, 8, 9],
                                   index=labels)

        def one_column_frame(column):
            return pdml.ModelFrame({column: [1, 2, 3]},
                                   target=[7, 8, 9],
                                   index=labels)

        # a column taken from the frame itself
        mdf = two_column_frame()
        mdf.data = mdf['A']
        tm.assert_frame_equal(mdf, one_column_frame('A'))

        # an unrelated series; its name becomes the column label
        mdf = two_column_frame()
        mdf.data = pd.Series([1, 2, 3], name='x', index=labels)
        tm.assert_frame_equal(mdf, one_column_frame('x'))

        # a plain list is not accepted
        mdf = two_column_frame()
        with self.assertRaises(TypeError):
            mdf.data = [1, 2, 3]
Beispiel #8
0
    def test_frame_data_proparty_series(self):
        """Assigning a Series to ``data`` replaces the data columns; a list raises."""
        labels = ['a', 'b', 'c']

        def two_column_frame():
            # each scenario starts from an identical, freshly built frame
            return pdml.ModelFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                                   target=[7, 8, 9],
                                   index=labels)

        def one_column_frame(column):
            return pdml.ModelFrame({column: [1, 2, 3]},
                                   target=[7, 8, 9],
                                   index=labels)

        # a column taken from the frame itself
        mdf = two_column_frame()
        mdf.data = mdf['A']
        tm.assert_frame_equal(mdf, one_column_frame('A'))

        # an unrelated series; its name becomes the column label
        mdf = two_column_frame()
        mdf.data = pd.Series([1, 2, 3], name='x', index=labels)
        tm.assert_frame_equal(mdf, one_column_frame('x'))

        # a plain list is not accepted
        mdf = two_column_frame()
        with pytest.raises(TypeError):
            mdf.data = [1, 2, 3]
Beispiel #9
0
    def test_frame_init_df_duplicated(self):
        """Target name equal to a data column: error at init, replacement on set."""
        names = ['A', 'B', 'C']

        # initialization by dataframe and duplicated target
        data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                            columns=names)
        clashing = pd.Series([10, 11, 12], name='A')
        with pytest.raises(ValueError,
                           match="data and target must have unique names"):
            pdml.ModelFrame(data, target=clashing)

        # assigning the same-named series afterwards overwrites column 'A'
        mdf = pdml.ModelFrame({'A': [1, 2, 3],
                               'B': [4, 5, 6],
                               'C': [7, 8, 9]},
                              columns=names)
        mdf.target = pd.Series([10, 11, 12], name='A')
        wanted = pdml.ModelFrame({'A': [10, 11, 12],
                                  'B': [4, 5, 6],
                                  'C': [7, 8, 9]},
                                 columns=names)
        tm.assert_frame_equal(mdf, wanted)
    def test_LabelEncoder_frame(self):
        """LabelEncoder round trip on a single-column ModelFrame."""
        mdf = pdml.ModelFrame(np.array(['X', 'Y', 'Z', 'X']),
                              index=['a', 'b', 'c', 'd'], columns=['A'])
        codes = np.array([0, 1, 2, 0]).reshape(-1, 1)

        def check_encoded(frame):
            # encoded output keeps the container type and both axes
            self.assertIsInstance(frame, pdml.ModelFrame)
            self.assert_numpy_array_almost_equal(frame.values, codes)
            tm.assert_index_equal(frame.columns, mdf.columns)
            tm.assert_index_equal(frame.index, mdf.index)

        # separate fit and transform
        encoder = mdf.pp.LabelEncoder()
        mdf.fit(encoder)
        check_encoded(mdf.transform(encoder))

        # combined fit_transform with a fresh encoder
        encoder = mdf.pp.LabelEncoder()
        encoded = mdf.fit_transform(encoder)
        check_encoded(encoded)

        # decoding gives back the original frame
        restored = encoded.inverse_transform(encoder)
        self.assertIsInstance(restored, pdml.ModelFrame)
        tm.assert_frame_equal(restored, mdf)
Beispiel #11
0
    def test_LabelEncoder_frame(self):
        """LabelEncoder round trip on a single-column ModelFrame."""
        mdf = pdml.ModelFrame(np.array(['X', 'Y', 'Z', 'X']),
                              index=['a', 'b', 'c', 'd'], columns=['A'])
        codes = np.array([0, 1, 2, 0]).reshape(-1, 1)

        def check_encoded(frame):
            # encoded output keeps the container type and both axes
            self.assertIsInstance(frame, pdml.ModelFrame)
            self.assert_numpy_array_almost_equal(frame.values, codes)
            tm.assert_index_equal(frame.columns, mdf.columns)
            tm.assert_index_equal(frame.index, mdf.index)

        # separate fit and transform
        encoder = mdf.pp.LabelEncoder()
        mdf.fit(encoder)
        check_encoded(mdf.transform(encoder))

        # combined fit_transform with a fresh encoder
        encoder = mdf.pp.LabelEncoder()
        encoded = mdf.fit_transform(encoder)
        check_encoded(encoded)

        # decoding gives back the original frame
        restored = encoded.inverse_transform(encoder)
        self.assertIsInstance(restored, pdml.ModelFrame)
        tm.assert_frame_equal(restored, mdf)
    def test_FunctionTransformer(self):
        """A FunctionTransformer that adds one is expected to shift the data part."""
        mdf = pdml.ModelFrame(datasets.load_iris())

        add_one = mdf.pp.FunctionTransformer(func=lambda x: x + 1)
        mdf.fit(add_one)
        transformed = mdf.transform(add_one)

        # reference: copy of the frame with only ``data`` incremented
        wanted = mdf.copy()
        wanted.data = wanted.data + 1

        self.assertIsInstance(transformed, pdml.ModelFrame)
        tm.assert_frame_equal(transformed, wanted)
Beispiel #13
0
    def test_transform_standard(self):
        """``groupby(...).transform`` on a ModelFrame matches plain pandas."""
        # check pandas standard transform works
        plain = pd.DataFrame({
            'A': ['A', 'B', 'A', 'A', 'A', 'B', 'B', 'B'],
            'B': np.random.randn(8),
            'C': np.random.randn(8)
        })
        wrapped = pdml.ModelFrame(plain)

        from_pandas = plain.groupby('A').transform('mean')
        from_model_frame = wrapped.groupby('A').transform('mean')
        tm.assert_frame_equal(from_pandas, from_model_frame)
Beispiel #14
0
    def test_grid_search(self):
        """``model_selection.describe`` mirrors the fitted search's ``cv_results_``."""
        rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                    'C': [1, 10, 100]}
        linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100]}

        mdf = pdml.ModelFrame(datasets.load_digits())
        search = mdf.model_selection.GridSearchCV(mdf.svm.SVC(C=1),
                                                  [rbf_grid, linear_grid],
                                                  cv=5)

        # fit inside a seeded RNG context
        with tm.RNGContext(1):
            mdf.fit(search)

        described = mdf.model_selection.describe(search)
        wanted = pd.DataFrame(search.cv_results_)
        self.assertIsInstance(described, pdml.ModelFrame)
        tm.assert_frame_equal(described, wanted)
Beispiel #15
0
    def test_grid_search(self):
        """``model_selection.describe`` mirrors ``cv_results_`` for an XGBClassifier search."""
        param_grid = [{'max_depth': [3, 4], 'n_estimators': [50, 100]}]

        mdf = pdml.ModelFrame(datasets.load_digits())
        classifier = mdf.xgb.XGBClassifier()
        search = mdf.model_selection.GridSearchCV(classifier, param_grid, cv=5)

        # fit inside a seeded RNG context
        with tm.RNGContext(1):
            mdf.fit(search)

        described = mdf.model_selection.describe(search)
        wanted = pd.DataFrame(search.cv_results_)
        self.assertIsInstance(described, pdml.ModelFrame)
        tm.assert_frame_equal(described, wanted)
Beispiel #16
0
    def test_frame_init_dict_list_series_index(self):
        """Dict data plus an indexed Series target: the frame takes the Series index."""
        labels = ['a', 'b', 'c']
        content = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
        named_target = pd.Series([9, 8, 7], name='X', index=labels)

        mdf = pdml.ModelFrame(content, target=named_target)

        # container type, shape and labels
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(['X', 'A', 'B', 'C']))

        # data and target parts
        wanted_data = pd.DataFrame(content, index=labels)
        tm.assert_frame_equal(mdf.data, wanted_data)
        tm.assert_series_equal(mdf.target, named_target)
        self.assertEqual(mdf.target.name, 'X')
        self.assertEqual(mdf.target_name, 'X')
Beispiel #17
0
    def test_frame_init_dict_list_series_index(self):
        """Dict data plus an indexed Series target: the frame takes the Series index."""
        labels = ['a', 'b', 'c']
        content = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
        named_target = pd.Series([9, 8, 7], name='X', index=labels)

        mdf = pdml.ModelFrame(content, target=named_target)

        # container type, shape and labels
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(['X', 'A', 'B', 'C']))

        # data and target parts
        wanted_data = pd.DataFrame(content, index=labels)
        tm.assert_frame_equal(mdf.data, wanted_data)
        tm.assert_series_equal(mdf.target, named_target)
        self.assertEqual(mdf.target.name, 'X')
        self.assertEqual(mdf.target_name, 'X')
Beispiel #18
0
    def test_frame_init_dict_list(self):
        """ModelFrame construction from DataFrame/dict data with list or str targets."""
        labels = ['a', 'b', 'c']
        names = ['A', 'B', 'C']
        with_target = ['.target', 'A', 'B', 'C']

        # 1) DataFrame data + list target -> default target name '.target'
        data = pd.DataFrame({'A': [1, 2, 3],
                             'B': [4, 5, 6],
                             'C': [7, 8, 9]},
                            index=labels, columns=names)
        mdf = pdml.ModelFrame(data, target=[1, 2, 3])
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(with_target))
        tm.assert_frame_equal(mdf.data, data)
        wanted_target = pd.Series([1, 2, 3], index=labels, name='.target')
        tm.assert_series_equal(mdf.target, wanted_target)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        # 2) dict data + list target -> integer index
        content = {'A': [1, 2, 3],
                   'B': [4, 5, 6],
                   'C': [7, 8, 9]}
        mdf = pdml.ModelFrame(content, target=[1, 2, 3])
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index([0, 1, 2]))
        tm.assert_index_equal(mdf.columns, pd.Index(with_target))
        tm.assert_frame_equal(mdf.data, pd.DataFrame(content))
        wanted_target = pd.Series([1, 2, 3], index=[0, 1, 2], name='.target')
        tm.assert_series_equal(mdf.target, wanted_target)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        # 3) dict data + str target -> an existing column becomes the target
        mdf = pdml.ModelFrame(content, target='A')
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index([0, 1, 2]))
        tm.assert_index_equal(mdf.columns, pd.Index(names))
        reference = pd.DataFrame(content)
        tm.assert_frame_equal(mdf.data, reference[['B', 'C']])
        tm.assert_series_equal(mdf.target, reference['A'])
        self.assertEqual(mdf.target.name, 'A')
        self.assertEqual(mdf.target_name, 'A')

        # 4) dict data without target -> the whole frame is data
        mdf = pdml.ModelFrame({'A': [1, 2, 3],
                               'B': [4, 5, 6],
                               'C': [7, 8, 9]},
                              index=labels, columns=names)
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(names))
        tm.assert_frame_equal(mdf.data, mdf)
        self.assertEqual(mdf.target_name, '.target')
Beispiel #19
0
    def test_frame_init_df_none(self):
        """With ``target=None`` the ModelFrame holds data only."""
        # initialization by dataframe and none
        labels = ['a', 'b', 'c']
        names = ['A', 'B', 'C']
        data = pd.DataFrame({'A': [1, 2, 3],
                             'B': [4, 5, 6],
                             'C': [7, 8, 9]},
                            index=labels, columns=names)

        mdf = pdml.ModelFrame(data, target=None)

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(names))
        tm.assert_frame_equal(mdf.data, data)
        self.assertTrue(mdf.has_data())
        # no target is stored, yet the default target name is reported
        self.assertIsNone(mdf.target)
        self.assertEqual(mdf.target_name, '.target')
Beispiel #20
0
    def test_FunctionTransformer(self):
        """A FunctionTransformer that adds one is expected to shift the data part.

        The test is skipped when ``pdml.compat._SKLEARN_ge_017`` is false.
        """
        if not pdml.compat._SKLEARN_ge_017:
            # unittest's own skip replaces the former ``nose.SkipTest`` so the
            # test no longer depends on the nose package being installed.
            self.skipTest('pdml.compat._SKLEARN_ge_017 is not set')

        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        mod1 = df.pp.FunctionTransformer(func=lambda x: x + 1)
        df.fit(mod1)
        result = df.transform(mod1)

        # reference: copy of the frame with only ``data`` incremented
        exp = df.copy()
        exp.data = exp.data + 1

        self.assertIsInstance(result, pdml.ModelFrame)
        tm.assert_frame_equal(result, exp)
Beispiel #21
0
    def test_frame_init_df_df(self):
        """Initialise a ModelFrame with a DataFrame as the target."""
        # initialization by dataframe and dataframe
        labels = ['a', 'b', 'c']
        data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                            index=labels, columns=['A', 'B', 'C'])

        # two target columns -> multi-target frame
        two_targets = pd.DataFrame({'t1': [10, 11, 12], 't2': [13, 14, 15]},
                                   index=labels)
        mdf = pdml.ModelFrame(data, target=two_targets)
        all_columns = ['t1', 't2', 'A', 'B', 'C']

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 5))
        wanted = pd.DataFrame({'t1': [10, 11, 12],
                               't2': [13, 14, 15],
                               'A': [1, 2, 3],
                               'B': [4, 5, 6],
                               'C': [7, 8, 9]},
                              index=labels, columns=all_columns)
        tm.assert_frame_equal(mdf, wanted)
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(all_columns))
        tm.assert_frame_equal(mdf.data, data)
        tm.assert_frame_equal(mdf.target, two_targets)
        tm.assert_index_equal(mdf.target.columns, pd.Index(['t1', 't2']))
        tm.assert_index_equal(mdf.target_name, pd.Index(['t1', 't2']))
        self.assertTrue(mdf.has_multi_targets())

        # a target with the default integer index does not match the data
        unaligned = pd.DataFrame({'t1': [10, 11, 12], 't2': [13, 14, 15]})
        with pytest.raises(ValueError,
                           match='data and target must have equal index'):
            pdml.ModelFrame(data, target=unaligned)

        # single column DataFrame will results in single target column
        one_target = pd.DataFrame({'t1': [10, 11, 12]}, index=labels)
        mdf = pdml.ModelFrame(data, target=one_target)
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(['t1', 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, data)

        # the target is exposed as a Series, not a one-column frame
        as_series = pd.Series([10, 11, 12], name='t1', index=labels)
        tm.assert_series_equal(mdf.target, as_series)
        self.assertEqual(mdf.target_name, 't1')
Beispiel #22
0
    def test_frame_init_df_array_series(self):
        """ndarray data combined with a named Series target."""
        labels = ['a', 'b', 'c']
        names = ['A', 'B', 'C']
        rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

        target = pd.Series([1, 2, 3], index=labels, name='.target')
        mdf = pdml.ModelFrame(np.array(rows), target=target,
                              index=labels, columns=names)

        # container type, shape and labels
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns,
                              pd.Index(['.target', 'A', 'B', 'C']))

        # data and target parts
        wanted_data = pd.DataFrame(np.array(rows), index=labels, columns=names)
        tm.assert_frame_equal(mdf.data, wanted_data)
        tm.assert_series_equal(mdf.target, target)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')
Beispiel #23
0
    def test_frame_target_object(self):
        """Select the target by a datetime column label."""
        labels = ['a', 'b', 'c']
        target_key = datetime.datetime(2016, 1, 1)
        data = pd.DataFrame({datetime.datetime(2014, 1, 1): [1, 2, 3],
                             datetime.datetime(2015, 1, 1): [4, 5, 6],
                             datetime.datetime(2016, 1, 1): [7, 8, 9]},
                            index=labels)
        mdf = pdml.ModelFrame(data, target=datetime.datetime(2016, 1, 1))

        # container type, shape and labels
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        wanted_columns = pd.DatetimeIndex(['2014-01-01', '2015-01-01',
                                           '2016-01-01'])
        tm.assert_index_equal(mdf.columns, wanted_columns)

        # the first two columns are data, the 2016 column is the target
        tm.assert_frame_equal(mdf.data, data.iloc[:, :2])
        wanted_target = pd.Series([7, 8, 9], index=labels,
                                  name=pd.Timestamp('2016-01-01'))
        tm.assert_series_equal(mdf.target, wanted_target)
        self.assertEqual(mdf.target.name, target_key)
        self.assertEqual(mdf.target_name, target_key)
Beispiel #24
0
    def test_grid_search(self):
        """``grid_search.describe`` output is compared with fixed score values."""
        param_grid = [{'max_depth': [3, 4],
                       'n_estimators': [50, 100]}]

        mdf = pdml.ModelFrame(datasets.load_digits())
        classifier = mdf.xgb.XGBClassifier()
        search = mdf.grid_search.GridSearchCV(classifier, param_grid, cv=5)

        # fit inside a seeded RNG context
        with tm.RNGContext(1):
            mdf.fit(search)

        described = mdf.grid_search.describe(search)
        summary_columns = ['mean', 'std', 'max_depth', 'n_estimators']
        wanted = pd.DataFrame({'mean': [0.89705064, 0.91764051, 0.91263216, 0.91930996],
                               'std': [0.03244061, 0.03259985, 0.02764891, 0.0266436],
                               'max_depth': [3, 3, 4, 4],
                               'n_estimators': [50, 100, 50, 100]},
                              columns=summary_columns)
        self.assertIsInstance(described, pdml.ModelFrame)
        tm.assert_frame_equal(described, wanted)
Beispiel #25
0
    def test_frame_init_df_none(self):
        """With ``target=None`` the ModelFrame holds data only."""
        # initialization by dataframe and none
        labels = ['a', 'b', 'c']
        names = ['A', 'B', 'C']
        data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                            index=labels, columns=names)

        mdf = pdml.ModelFrame(data, target=None)

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(names))
        tm.assert_frame_equal(mdf.data, data)
        self.assertTrue(mdf.has_data())
        # no target is stored, yet the default target name is reported
        self.assertIsNone(mdf.target)
        self.assertEqual(mdf.target_name, '.target')
Beispiel #26
0
    def test_frame_target_object_set(self):
        """Assign ``target`` on a frame whose column labels are datetime objects."""
        labels = ['a', 'b', 'c']
        data = pd.DataFrame({datetime.datetime(2014, 1, 1): [1, 2, 3],
                             datetime.datetime(2015, 1, 1): [4, 5, 6],
                             datetime.datetime(2016, 1, 1): [7, 8, 9]},
                            index=labels)
        mdf = pdml.ModelFrame(data)
        stamps = [datetime.datetime(year, 1, 1) for year in (2014, 2015, 2016)]

        # first assignment: the series name (5) becomes the target column
        mdf.target = pd.Series(['A', 'B', 'C'], index=labels, name=5)
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index([5] + stamps))
        tm.assert_frame_equal(mdf.data, data)
        wanted_target = pd.Series(['A', 'B', 'C'], index=labels, name=5)
        tm.assert_series_equal(mdf.target, wanted_target)
        self.assertEqual(mdf.target.name, 5)

        # name will be ignored if ModelFrame already has a target
        mdf.target = pd.Series([10, 11, 12], index=labels, name='X')
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index([5] + stamps))
        tm.assert_frame_equal(mdf.data, data)
        wanted_target = pd.Series([10, 11, 12], index=labels, name=5)
        tm.assert_series_equal(mdf.target, wanted_target)
        self.assertEqual(mdf.target.name, 5)
Beispiel #27
0
    def test_frame_init_df_array_series(self):
        """ndarray data combined with a named Series target."""
        labels = ['a', 'b', 'c']
        names = ['A', 'B', 'C']
        rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

        target = pd.Series([1, 2, 3], index=labels, name='.target')
        mdf = pdml.ModelFrame(np.array(rows), target=target,
                              index=labels, columns=names)

        # container type, shape and labels
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns,
                              pd.Index(['.target', 'A', 'B', 'C']))

        # data and target parts
        wanted_data = pd.DataFrame(np.array(rows), index=labels, columns=names)
        tm.assert_frame_equal(mdf.data, wanted_data)
        tm.assert_series_equal(mdf.target, target)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')
Beispiel #28
0
    def test_frame_init_df_duplicated_columns(self):
        """Overlapping data/target column names produce two-level column labels."""
        # initialization by dataframe and dataframe which have same columns
        data = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
        target = pd.DataFrame({'A': [10, 11, 12], 'B': [13, 14, 15]})

        mdf = pdml.ModelFrame(data, target=target)

        # target columns first, then data columns, each tagged by its origin
        column_keys = [('.target', 'A'), ('.target', 'B'),
                       ('.data', 'A'), ('.data', 'B'), ('.data', 'C')]
        wanted = pd.DataFrame({('.target', 'A'): [10, 11, 12],
                               ('.target', 'B'): [13, 14, 15],
                               ('.data', 'A'): [1, 2, 3],
                               ('.data', 'B'): [4, 5, 6],
                               ('.data', 'C'): [7, 8, 9]},
                              columns=pd.MultiIndex.from_tuples(column_keys))
        tm.assert_frame_equal(mdf, wanted)
Beispiel #29
0
    def test_frame_init_df_str(self):
        """Select the target by column name; an unknown name raises ``ValueError``."""
        # ``assertRaisesRegexp`` was removed from unittest in Python 3.12;
        # prefer the current spelling and fall back only where it is missing.
        assert_raises_regex = (getattr(self, 'assertRaisesRegex', None) or
                               self.assertRaisesRegexp)

        # initialization by dataframe and str
        df = pd.DataFrame({'A': [1, 2, 3],
                           'B': [4, 5, 6],
                           'C': [7, 8, 9]},
                          index=['a', 'b', 'c'],
                          columns=['A', 'B', 'C'])

        mdf = pdml.ModelFrame(df, target='A')
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['A', 'B', 'C']))
        # column 'A' is the target, the remaining columns are data
        tm.assert_frame_equal(mdf.data, df[['B', 'C']])
        tm.assert_series_equal(mdf.target, df['A'])
        self.assertEqual(mdf.target.name, 'A')
        self.assertEqual(mdf.target_name, 'A')

        # a name that is not a column of the data is rejected
        msg = "Specified target 'X' is not included in data"
        with assert_raises_regex(ValueError, msg):
            pdml.ModelFrame(df, target='X')
    def test_LabelBinarizer2(self):
        # binarize string labels held in a ModelSeries
        labels = pdml.ModelSeries(np.array(['X', 'Y', 'Z', 'X']))

        lb = labels.preprocessing.LabelBinarizer()
        labels.fit(lb)

        encoded = labels.transform(lb)
        self.assertIsInstance(encoded, pdml.ModelFrame)

        expected = pd.DataFrame({0: [1, 0, 0, 1],
                                 1: [0, 1, 0, 0],
                                 2: [0, 0, 1, 0]})
        tm.assert_frame_equal(encoded, expected)

        # refit the same binarizer on the iris target and transform it
        iris = pdml.ModelFrame(datasets.load_iris())
        iris.target.fit(lb)
        encoded = iris.target.transform(lb)

        # expected indicator columns: three consecutive runs of 50 rows
        on, off = [1] * 50, [0] * 50
        expected = pd.DataFrame({0: on + off + off,
                                 1: off + on + off,
                                 2: off + off + on})
        tm.assert_frame_equal(encoded, expected)

        # assign the transformed frame back as the target of a fresh frame
        iris = pdml.ModelFrame(datasets.load_iris())
        iris.target.fit(lb)
        iris.target = iris.target.transform(lb)
        self.assertEqual(iris.shape, (150, 7))
        tm.assert_frame_equal(iris.target, expected)
Beispiel #31
0
    def test_LabelBinarizer2(self):
        # fit and apply a LabelBinarizer through the ModelSeries accessors
        series = pdml.ModelSeries(np.array(['X', 'Y', 'Z', 'X']))

        binarizer = series.preprocessing.LabelBinarizer()
        series.fit(binarizer)

        result = series.transform(binarizer)
        self.assertIsInstance(result, pdml.ModelFrame)

        expected = pd.DataFrame({
            0: [1, 0, 0, 1],
            1: [0, 1, 0, 0],
            2: [0, 0, 1, 0],
        })
        tm.assert_frame_equal(result, expected)

        # same binarizer, now fitted on the iris target
        frame = pdml.ModelFrame(datasets.load_iris())
        frame.target.fit(binarizer)
        result = frame.target.transform(binarizer)

        expected = pd.DataFrame({
            0: [1] * 50 + [0] * 100,
            1: [0] * 50 + [1] * 50 + [0] * 50,
            2: [0] * 100 + [1] * 50,
        })
        tm.assert_frame_equal(result, expected)

        # writing the transformed frame back through the target setter
        frame = pdml.ModelFrame(datasets.load_iris())
        frame.target.fit(binarizer)
        frame.target = frame.target.transform(binarizer)
        self.assertEqual(frame.shape, (150, 7))
        tm.assert_frame_equal(frame.target, expected)
Beispiel #32
0
    def test_frame_init_df_duplicated(self):
        # initialization by dataframe and a target whose name duplicates a
        # data column
        df = pd.DataFrame({'A': [1, 2, 3],
                           'B': [4, 5, 6],
                           'C': [7, 8, 9]},
                          columns=['A', 'B', 'C'])
        s = pd.Series([10, 11, 12], name='A')

        # the constructor must refuse a target named like a data column
        msg = "data and target must have unique names"
        # assertRaisesRegex replaces the deprecated alias assertRaisesRegexp,
        # which was removed from unittest in Python 3.12
        with self.assertRaisesRegex(ValueError, msg):
            pdml.ModelFrame(df, target=s)

        # assigning through the target setter is accepted; the expected frame
        # below has column 'A' holding the assigned values
        df = pdml.ModelFrame({'A': [1, 2, 3],
                              'B': [4, 5, 6],
                              'C': [7, 8, 9]},
                             columns=['A', 'B', 'C'])
        df.target = pd.Series([10, 11, 12], name='A')
        expected = pdml.ModelFrame({'A': [10, 11, 12],
                                    'B': [4, 5, 6],
                                    'C': [7, 8, 9]},
                                   columns=['A', 'B', 'C'])
        tm.assert_frame_equal(df, expected)
Beispiel #33
0
    def test_frame_init_df_duplicated_columns(self):
        # both the data frame and the target frame have columns 'A' and 'B'
        features = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
        labels = pd.DataFrame({'A': [10, 11, 12], 'B': [13, 14, 15]})

        result = pdml.ModelFrame(features, target=labels)

        # expected columns are two-level: '.target' entries, then '.data' ones
        keys = [('.target', 'A'), ('.target', 'B'),
                ('.data', 'A'), ('.data', 'B'), ('.data', 'C')]
        values = [[10, 11, 12], [13, 14, 15],
                  [1, 2, 3], [4, 5, 6], [7, 8, 9]]
        expected = pd.DataFrame(dict(zip(keys, values)),
                                columns=pd.MultiIndex.from_tuples(keys))
        tm.assert_frame_equal(result, expected)
Beispiel #34
0
    def test_grid_search(self):
        # two grids: an RBF kernel searched over gamma and C, and a linear
        # kernel searched over C only
        rbf_grid = {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]}
        linear_grid = {'kernel': ['linear'], 'C': [1, 10, 100]}
        tuned_parameters = [rbf_grid, linear_grid]

        df = pdml.ModelFrame(datasets.load_digits())
        cv = df.model_selection.GridSearchCV(
            df.svm.SVC(C=1), tuned_parameters, cv=5)

        with tm.RNGContext(1):
            df.fit(cv)

        result = df.model_selection.describe(cv)

        # expected summary: six 'rbf' rows followed by three 'linear' rows,
        # with NaN gamma on the linear rows
        mean = [0.97161937, 0.9476906, 0.97273233, 0.95937674, 0.97273233,
                0.96271564, 0.94936004, 0.94936004, 0.94936004]
        std = [0.01546977, 0.0221161, 0.01406514, 0.02295168, 0.01406514,
               0.01779749, 0.01911084, 0.01911084, 0.01911084]
        gamma = [0.001, 0.0001, 0.001, 0.0001, 0.001, 0.0001,
                 np.nan, np.nan, np.nan]
        kernel = ['rbf'] * 6 + ['linear'] * 3
        expected = pd.DataFrame({'mean': mean,
                                 'std': std,
                                 'C': [1, 1, 10, 10, 100, 100, 1, 10, 100],
                                 'gamma': gamma,
                                 'kernel': kernel},
                                columns=['mean', 'std', 'C', 'gamma', 'kernel'])
        self.assertIsInstance(result, pdml.ModelFrame)
        tm.assert_frame_equal(result, expected)
Beispiel #35
0
    def test_grid_search(self):
        # grid over tree depth and number of estimators for XGBClassifier
        tuned_parameters = [{'max_depth': [3, 4], 'n_estimators': [50, 100]}]

        df = pdml.ModelFrame(datasets.load_digits())
        cv = df.grid_search.GridSearchCV(
            df.xgb.XGBClassifier(), tuned_parameters, cv=5)

        with tm.RNGContext(1):
            df.fit(cv)

        result = df.grid_search.describe(cv)

        # one expected row per (max_depth, n_estimators) combination
        columns = ['mean', 'std', 'max_depth', 'n_estimators']
        values = [
            [0.89705064, 0.91764051, 0.91263216, 0.91930996],
            [0.03244061, 0.03259985, 0.02764891, 0.0266436],
            [3, 3, 4, 4],
            [50, 100, 50, 100],
        ]
        expected = pd.DataFrame(dict(zip(columns, values)), columns=columns)
        self.assertIsInstance(result, pdml.ModelFrame)
        tm.assert_frame_equal(result, expected)
Beispiel #36
0
    def test_frame_target_object(self):
        # column labels are datetime objects, and the target is selected by
        # passing one of those objects
        y2014 = datetime.datetime(2014, 1, 1)
        y2015 = datetime.datetime(2015, 1, 1)
        y2016 = datetime.datetime(2016, 1, 1)
        rows = ['a', 'b', 'c']
        df = pd.DataFrame({y2014: [1, 2, 3],
                           y2015: [4, 5, 6],
                           y2016: [7, 8, 9]},
                          index=rows)
        mdf = pdml.ModelFrame(df, target=y2016)

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(rows))
        expected_columns = pd.DatetimeIndex(['2014-01-01', '2015-01-01',
                                             '2016-01-01'])
        tm.assert_index_equal(mdf.columns, expected_columns)
        # the first two columns are the data, the last one is the target
        tm.assert_frame_equal(mdf.data, df.iloc[:, :2])
        expected_target = pd.Series([7, 8, 9], index=rows,
                                    name=pd.Timestamp('2016-01-01'))
        tm.assert_series_equal(mdf.target, expected_target)
        self.assertEqual(mdf.target.name, y2016)
        self.assertEqual(mdf.target_name, y2016)
Beispiel #37
0
    def test_frame_init_df_df(self):
        # initialization by dataframe (data) and dataframe (target)
        df = pd.DataFrame({'A': [1, 2, 3],
                           'B': [4, 5, 6],
                           'C': [7, 8, 9]},
                          index=['a', 'b', 'c'],
                          columns=['A', 'B', 'C'])
        target = pd.DataFrame({'t1': [10, 11, 12],
                               't2': [13, 14, 15]},
                              index=['a', 'b', 'c'])
        mdf = pdml.ModelFrame(df, target=target)

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 5))
        # target columns are expected ahead of the data columns
        expected = pd.DataFrame({'t1': [10, 11, 12],
                                 't2': [13, 14, 15],
                                 'A': [1, 2, 3],
                                 'B': [4, 5, 6],
                                 'C': [7, 8, 9]},
                                index=['a', 'b', 'c'],
                                columns=['t1', 't2', 'A', 'B', 'C'])
        tm.assert_frame_equal(mdf, expected)
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns,
                              pd.Index(['t1', 't2', 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)
        tm.assert_frame_equal(mdf.target, target)
        tm.assert_index_equal(mdf.target.columns, pd.Index(['t1', 't2']))
        tm.assert_index_equal(mdf.target_name, pd.Index(['t1', 't2']))
        self.assertTrue(mdf.has_multi_targets())

        # a target whose index differs from the data index must be rejected
        target = pd.DataFrame({'t1': [10, 11, 12], 't2': [13, 14, 15]})
        msg = 'data and target must have equal index'
        # assertRaisesRegex replaces the deprecated alias assertRaisesRegexp,
        # which was removed from unittest in Python 3.12
        with self.assertRaisesRegex(ValueError, msg):
            pdml.ModelFrame(df, target=target)

        # a single-column DataFrame results in a single target column
        target = pd.DataFrame({'t1': [10, 11, 12]}, index=['a', 'b', 'c'])
        mdf = pdml.ModelFrame(df, target=target)
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['t1', 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)

        # in that case .target is compared against a Series, not a DataFrame
        target = pd.Series([10, 11, 12], name='t1', index=['a', 'b', 'c'])
        tm.assert_series_equal(mdf.target, target)
        self.assertEqual(mdf.target_name, 't1')
Beispiel #38
0
    def test_frame_init_df_str(self):
        # build a ModelFrame from a DataFrame, naming the target column by str
        row_labels = ['a', 'b', 'c']
        source = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                              index=row_labels,
                              columns=['A', 'B', 'C'])

        mdf = pdml.ModelFrame(source, target='A')
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(row_labels))
        tm.assert_index_equal(mdf.columns, pd.Index(['A', 'B', 'C']))
        # 'A' is exposed as the target, the remaining columns as the data
        tm.assert_frame_equal(mdf.data, source[['B', 'C']])
        tm.assert_series_equal(mdf.target, source['A'])
        self.assertEqual(mdf.target.name, 'A')
        self.assertEqual(mdf.target_name, 'A')

        # an unknown column name is rejected with a ValueError
        expected_error = "Specified target 'X' is not included in data"
        with pytest.raises(ValueError, match=expected_error):
            pdml.ModelFrame(source, target='X')
    def test_grid_search(self):
        # parameter grids: RBF kernel over gamma and C, linear kernel over C
        tuned_parameters = [
            {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100]},
            {'kernel': ['linear'], 'C': [1, 10, 100]},
        ]

        df = pdml.ModelFrame(datasets.load_digits())
        cv = df.model_selection.GridSearchCV(df.svm.SVC(C=1),
                                             tuned_parameters,
                                             cv=5)

        with tm.RNGContext(1):
            df.fit(cv)

        result = df.model_selection.describe(cv)

        # expected per-candidate summary, in the column order asserted below
        column_order = ['mean', 'std', 'C', 'gamma', 'kernel']
        expected = pd.DataFrame(
            dict(
                mean=[0.97161937, 0.9476906, 0.97273233,
                      0.95937674, 0.97273233, 0.96271564,
                      0.94936004, 0.94936004, 0.94936004],
                std=[0.01546977, 0.0221161, 0.01406514,
                     0.02295168, 0.01406514, 0.01779749,
                     0.01911084, 0.01911084, 0.01911084],
                C=[1, 1, 10, 10, 100, 100, 1, 10, 100],
                gamma=[0.001, 0.0001, 0.001, 0.0001, 0.001, 0.0001,
                       np.nan, np.nan, np.nan],
                kernel=['rbf'] * 6 + ['linear'] * 3,
            ),
            columns=column_order)
        self.assertIsInstance(result, pdml.ModelFrame)
        tm.assert_frame_equal(result, expected)
Beispiel #40
0
    def test_frame_init_df_series(self):
        # initialization by dataframe and a named series as target
        rows = ['a', 'b', 'c']
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                          index=rows,
                          columns=['A', 'B', 'C'])

        def build_and_check(name):
            # construct with a series called `name` and check that the name
            # becomes the leading column and the target name
            s = pd.Series([1, 2, 3], index=rows, name=name)
            mdf = pdml.ModelFrame(df, target=s)
            self.assertIsInstance(mdf, pdml.ModelFrame)
            self.assertEqual(mdf.shape, (3, 4))
            tm.assert_index_equal(mdf.index, pd.Index(rows))
            tm.assert_index_equal(mdf.columns,
                                  pd.Index([name, 'A', 'B', 'C']))
            tm.assert_frame_equal(mdf.data, df)
            tm.assert_series_equal(mdf.target, s)
            self.assertEqual(mdf.target.name, name)
            self.assertEqual(mdf.target_name, name)

        build_and_check('.target')

        # a series without the data's index is rejected
        unaligned = pd.Series([1, 2, 3])
        with pytest.raises(ValueError,
                           match='data and target must have equal index'):
            pdml.ModelFrame(df, target=unaligned)

        build_and_check('XXX')
Beispiel #41
0
    def test_train_test_split_keep_index(self):
        labels = 'a b c d e f g h'.split(' ')
        values = [1, 2, 3, 4, 5, 6, 7, 8]
        # row labels expected in the train / test parts for self.random_state
        train_rows = ['g', 'a', 'e', 'f', 'd', 'h']
        test_rows = ['c', 'b']

        # frame without target
        df = pdml.ModelFrame({'A': list(values), 'B': list(values)},
                             index=labels)
        tr, te = df.ms.train_test_split(random_state=self.random_state)
        tm.assert_frame_equal(tr, df.loc[train_rows])
        tm.assert_frame_equal(te, df.loc[test_rows])

        # reset_index=True: same rows, compared after dropping the labels
        tr, te = df.ms.train_test_split(random_state=self.random_state,
                                        reset_index=True)
        tm.assert_frame_equal(tr, df.loc[train_rows].reset_index(drop=True))
        tm.assert_frame_equal(te, df.loc[test_rows].reset_index(drop=True))

        # frame with a list target
        df = pdml.ModelFrame({'A': list(values), 'B': list(values)},
                             index=labels,
                             target=list(values))
        train_target = np.array([7, 1, 5, 6, 4, 8])
        test_target = np.array([3, 2])

        tr, te = df.ms.train_test_split(random_state=self.random_state)
        tm.assert_frame_equal(tr, df.loc[train_rows])
        tm.assert_numpy_array_equal(tr.target.values, train_target)
        tm.assert_frame_equal(te, df.loc[test_rows])
        tm.assert_numpy_array_equal(te.target.values, test_target)

        tr, te = df.ms.train_test_split(random_state=self.random_state,
                                        reset_index=True)
        tm.assert_frame_equal(tr, df.loc[train_rows].reset_index(drop=True))
        tm.assert_numpy_array_equal(tr.target.values, train_target)
        tm.assert_frame_equal(te, df.loc[test_rows].reset_index(drop=True))
        tm.assert_numpy_array_equal(te.target.values, test_target)
Beispiel #42
0
    def test_frame_data_proparty(self):
        # exercises setting, replacing and deleting the ``data`` property
        df = pd.DataFrame({'A': [1, 2, 3],
                           'B': [4, 5, 6],
                           'C': [7, 8, 9]},
                          index=['a', 'b', 'c'],
                          columns=['A', 'B', 'C'])
        s = pd.Series([1, 2, 3], index=['a', 'b', 'c'], name='.target')

        mdf = pdml.ModelFrame(df, target=s)
        self.assertIsInstance(mdf, pdml.ModelFrame)

        new = pd.DataFrame({'X': [1, 2, 3],
                            'Y': [4, 5, 6]},
                           index=['a', 'b', 'c'],
                           columns=['X', 'Y'])
        # set data property with a plain DataFrame
        mdf.data = new

        # the data columns are replaced while the target is unchanged
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['.target', 'X', 'Y']))
        tm.assert_frame_equal(mdf.data, new)
        tm.assert_series_equal(mdf.target, s)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        new = pdml.ModelFrame({'M': [1, 2, 3],
                               'N': [4, 5, 6]},
                              index=['a', 'b', 'c'],
                              columns=['M', 'N'])

        # set data property with a ModelFrame that has no target
        mdf.data = new

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['.target', 'M', 'N']))
        tm.assert_frame_equal(mdf.data, new)
        tm.assert_series_equal(mdf.target, s)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        new = pd.DataFrame({'.target': [1, 2, 3],
                            'K': [4, 5, 6]},
                           index=['a', 'b', 'c'])

        # assertRaisesRegex is used below instead of the deprecated alias
        # assertRaisesRegexp, which was removed from unittest in Python 3.12

        # unable to set data if passed value has the same column as the target
        msg = "Passed data has the same column name as the target '.target'"
        with self.assertRaisesRegex(ValueError, msg):
            mdf.data = new

        # unable to set ModelFrame with target attribute
        msg = "Cannot update with ModelFrame which has target attribute"
        with self.assertRaisesRegex(ValueError, msg):
            mdf.data = mdf

        # delete data property: only the target column is expected to remain
        del mdf.data
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 1))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['.target']))
        # assertIsNone reports the actual value on failure
        self.assertIsNone(mdf.data)
        tm.assert_series_equal(mdf.target, s)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')
    def test_split_keep_index(self):
        df = pdml.ModelFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8],
                              'B': [1, 2, 3, 4, 5, 6, 7, 8]},
                             index='a b c d e f g h'.split(' '))
        kf = df.model_selection.KFold(3, random_state=self.random_state)

        # expected (train, test) row selections for each of the three folds
        expected_folds = [
            (df.iloc[3:, :], df.iloc[:3, :]),
            (df.iloc[[0, 1, 2, 6, 7], :], df.iloc[3:6, :]),
            (df.iloc[:6, :], df.iloc[6:, :]),
        ]

        # default: the original index labels are kept
        folded = list(df.model_selection.split(kf))
        self.assertEqual(len(folded), 3)
        for fold, (train, test) in zip(folded, expected_folds):
            tm.assert_frame_equal(fold[0], train)
            tm.assert_frame_equal(fold[1], test)

        # reset_index=True: same rows, compared after dropping the labels
        folded = list(df.model_selection.split(kf, reset_index=True))
        self.assertEqual(len(folded), 3)
        for fold, (train, test) in zip(folded, expected_folds):
            tm.assert_frame_equal(fold[0], train.reset_index(drop=True))
            tm.assert_frame_equal(fold[1], test.reset_index(drop=True))
Beispiel #44
0
    def test_frame_init_dict_list(self):
        # DataFrame data combined with a plain list as target
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                             index=['a', 'b', 'c'],
                             columns=['A', 'B', 'C'])
        mdf = pdml.ModelFrame(frame, target=[1, 2, 3])
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns,
                              pd.Index(['.target', 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, frame)
        # the list is expected to take the data's index and the '.target' name
        expected_target = pd.Series([1, 2, 3], index=['a', 'b', 'c'],
                                    name='.target')
        tm.assert_series_equal(mdf.target, expected_target)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        # dict data combined with a plain list as target
        raw = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
        mdf = pdml.ModelFrame(raw, target=[1, 2, 3])
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index([0, 1, 2]))
        tm.assert_index_equal(mdf.columns,
                              pd.Index(['.target', 'A', 'B', 'C']))
        expected_data = pd.DataFrame(raw)
        tm.assert_frame_equal(mdf.data, expected_data)
        expected_target = pd.Series([1, 2, 3], index=[0, 1, 2],
                                    name='.target')
        tm.assert_series_equal(mdf.target, expected_target)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        # dict data with the target given as a key of the dict
        mdf = pdml.ModelFrame(raw, target='A')
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index([0, 1, 2]))
        tm.assert_index_equal(mdf.columns, pd.Index(['A', 'B', 'C']))
        expected_data = pd.DataFrame(raw)
        tm.assert_frame_equal(mdf.data, expected_data[['B', 'C']])
        tm.assert_series_equal(mdf.target, expected_data['A'])
        self.assertEqual(mdf.target.name, 'A')
        self.assertEqual(mdf.target_name, 'A')

        # dict data with explicit index / columns and no target at all
        mdf = pdml.ModelFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                              index=['a', 'b', 'c'],
                              columns=['A', 'B', 'C'])
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['A', 'B', 'C']))
        # without a target, .data is compared against the frame itself
        tm.assert_frame_equal(mdf.data, mdf)
        self.assertEqual(mdf.target_name, '.target')
Beispiel #45
0
    def test_frame_target_proparty(self):
        # exercises assigning series, lists and None to the ``target`` property
        df = pd.DataFrame({'A': [1, 2, 3],
                           'B': [4, 5, 6],
                           'C': [7, 8, 9]},
                          index=['a', 'b', 'c'],
                          columns=['A', 'B', 'C'])
        s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
        mdf = pdml.ModelFrame(df, target=s)

        new = pd.Series([4, 5, 6], index=['a', 'b', 'c'], name='.target')
        # set target property with a series named like the current target
        mdf.target = new

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['.target', 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)
        tm.assert_series_equal(mdf.target, new)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        # a series with a different name is expected to emit a UserWarning;
        # the assertions below expect the '.target' name to be kept
        with tm.assert_produces_warning(UserWarning):
            new = pd.Series([4, 5, 6], index=['a', 'b', 'c'], name='xxx')
            # set target property
            mdf.target = new

            self.assertIsInstance(mdf, pdml.ModelFrame)
            self.assertEqual(mdf.shape, (3, 4))
            tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
            tm.assert_index_equal(mdf.columns, pd.Index(['.target', 'A', 'B', 'C']))
            tm.assert_frame_equal(mdf.data, df)

            exp_target = pd.Series(new, name='.target')
            tm.assert_series_equal(mdf.target, exp_target)
            self.assertEqual(mdf.target.name, '.target')
            self.assertEqual(mdf.target_name, '.target')

        # assertRaisesRegex is used below instead of the deprecated alias
        # assertRaisesRegexp, which was removed from unittest in Python 3.12

        # a series whose index differs from the frame's index is rejected
        new = pd.Series([4, 5, 6], name='.target')
        with self.assertRaisesRegex(ValueError, 'data and target must have equal index'):
            mdf.target = new

        # set target property with a plain list
        mdf.target = [7, 8, 9]

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 4))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['.target', 'A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)
        expected = pd.Series([7, 8, 9], index=['a', 'b', 'c'], name='.target')
        tm.assert_series_equal(mdf.target, expected)
        self.assertEqual(mdf.target.name, '.target')
        self.assertEqual(mdf.target_name, '.target')

        # a list of the wrong length is rejected
        with self.assertRaisesRegex(ValueError, 'Wrong number of items passed 2, placement implies 3'):
            mdf.target = [1, 2]

        # set target property to None: only the data columns are expected
        mdf.target = None

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(['a', 'b', 'c']))
        tm.assert_index_equal(mdf.columns, pd.Index(['A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)
        self.assertEqual(mdf.target_name, '.target')
Beispiel #46
0
    def test_frame_init_df_target_setter(self):
        # start from a ModelFrame without target and assign DataFrame targets
        rows = ['a', 'b', 'c']
        feature_items = [('A', [1, 2, 3]), ('B', [4, 5, 6]), ('C', [7, 8, 9])]
        df = pd.DataFrame(dict(feature_items),
                          index=rows,
                          columns=['A', 'B', 'C'])
        mdf = pdml.ModelFrame(df)
        self.assertFalse(mdf.has_target())

        def verify(target_items, expected_target):
            # target_items: ordered (name, values) pairs expected as the
            # leading columns of mdf; expected_target: frame that mdf.target
            # must equal
            names = [name for name, _ in target_items]
            all_names = names + ['A', 'B', 'C']
            self.assertIsInstance(mdf, pdml.ModelFrame)
            self.assertEqual(mdf.shape, (3, len(all_names)))
            expected = pd.DataFrame(dict(target_items + feature_items),
                                    index=rows,
                                    columns=all_names)
            tm.assert_frame_equal(mdf, expected)
            tm.assert_index_equal(mdf.index, pd.Index(rows))
            tm.assert_index_equal(mdf.columns, pd.Index(all_names))
            tm.assert_frame_equal(mdf.data, df)
            tm.assert_frame_equal(mdf.target, expected_target)
            tm.assert_index_equal(mdf.target.columns, pd.Index(names))
            tm.assert_index_equal(mdf.target_name, pd.Index(names))
            self.assertTrue(mdf.has_multi_targets())

        # first assignment: two target columns 't1' and 't2'
        initial = [('t1', [10, 11, 12]), ('t2', [13, 14, 15])]
        target = pd.DataFrame(dict(initial), index=rows)
        mdf.target = target
        verify(initial, target)

        target = pd.DataFrame({'x1': [20, 21, 22], 'x2': [23, 24, 25]},
                              index=rows)
        with tm.assert_produces_warning(UserWarning):
            # when the target has the same length as the target_name,
            # it is renamed to the existing target ['t1', 't2']
            mdf.target = target
        renamed = [('t1', [20, 21, 22]), ('t2', [23, 24, 25])]
        verify(renamed, pd.DataFrame(dict(renamed), index=rows))

        # when the target has a different length than the target_name,
        # the target is replaced
        replaced = [('x1', [20, 21, 22]),
                    ('x2', [23, 24, 25]),
                    ('x3', [25, 26, 27])]
        target = pd.DataFrame(dict(replaced), index=rows)
        mdf.target = target
        verify(replaced, target)
Beispiel #47
0
    def test_split_keep_index(self):
        labels = 'a b c d e f g h'.split(' ')
        df = pdml.ModelFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8],
                              'B': [1, 2, 3, 4, 5, 6, 7, 8]},
                             index=labels)
        kf = df.model_selection.KFold(3, random_state=self.random_state)

        def check_folds(folds, prepare):
            # compare every fold against the expected row selections;
            # `prepare` is applied to each expected frame before comparison
            self.assertEqual(len(folds), 3)
            tm.assert_frame_equal(folds[0][0], prepare(df.iloc[3:, :]))
            tm.assert_frame_equal(folds[0][1], prepare(df.iloc[:3, :]))
            tm.assert_frame_equal(folds[1][0],
                                  prepare(df.iloc[[0, 1, 2, 6, 7], :]))
            tm.assert_frame_equal(folds[1][1], prepare(df.iloc[3:6, :]))
            tm.assert_frame_equal(folds[2][0], prepare(df.iloc[:6, :]))
            tm.assert_frame_equal(folds[2][1], prepare(df.iloc[6:, :]))

        # default split: expected frames are used with their index labels
        check_folds(list(df.model_selection.split(kf)),
                    lambda part: part)

        # reset_index=True: expected frames are compared after dropping labels
        check_folds(list(df.model_selection.split(kf, reset_index=True)),
                    lambda part: part.reset_index(drop=True))
Beispiel #48
0
    def test_frame_init_df_target_setter(self):
        """Exercise the ``target`` setter with multi-column DataFrame targets
        on a ModelFrame that was built from a DataFrame without a target."""
        labels = ['a', 'b', 'c']
        feature_names = ['A', 'B', 'C']
        feature_values = {'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}
        df = pd.DataFrame(feature_values, index=labels, columns=feature_names)

        mdf = pdml.ModelFrame(df)
        self.assertFalse(mdf.has_target())

        def verify(target_names, target_values, expected_target):
            # Shared checks on ``mdf`` after a target assignment: overall
            # frame content, index/columns, data part and target part.
            all_names = target_names + feature_names
            all_values = dict(feature_values)
            all_values.update(target_values)

            self.assertIsInstance(mdf, pdml.ModelFrame)
            self.assertEqual(mdf.shape, (3, len(all_names)))
            tm.assert_frame_equal(
                mdf, pd.DataFrame(all_values, index=labels, columns=all_names))
            tm.assert_index_equal(mdf.index, pd.Index(labels))
            tm.assert_index_equal(mdf.columns, pd.Index(all_names))
            tm.assert_frame_equal(mdf.data, df)
            tm.assert_frame_equal(mdf.target, expected_target)
            tm.assert_index_equal(mdf.target.columns, pd.Index(target_names))
            tm.assert_index_equal(mdf.target_name, pd.Index(target_names))
            self.assertTrue(mdf.has_multi_targets())

        # First assignment: a two-column target on a frame with no target.
        first_values = {'t1': [10, 11, 12], 't2': [13, 14, 15]}
        target = pd.DataFrame(first_values, index=labels)
        mdf.target = target
        verify(['t1', 't2'], first_values, target)

        # Second assignment: the new target has as many columns as the
        # existing target_name, so a UserWarning is expected and the columns
        # are expected to be renamed to the existing names ['t1', 't2'].
        target = pd.DataFrame({'x1': [20, 21, 22],
                               'x2': [23, 24, 25]},
                              index=labels)
        with tm.assert_produces_warning(UserWarning):
            mdf.target = target

        renamed_values = {'t1': [20, 21, 22], 't2': [23, 24, 25]}
        verify(['t1', 't2'], renamed_values,
               pd.DataFrame(renamed_values, index=labels))

        # Third assignment: the number of columns differs from the existing
        # target_name, so the target is expected to be replaced as given.
        third_values = {'x1': [20, 21, 22],
                        'x2': [23, 24, 25],
                        'x3': [25, 26, 27]}
        target = pd.DataFrame(third_values, index=labels)
        mdf.target = target
        verify(['x1', 'x2', 'x3'], third_values, target)
Beispiel #49
0
    def test_frame_target_proparty(self):
        """Exercise the ``target`` property setter with Series, list and
        ``None`` values, including the rejected inputs."""
        labels = ['a', 'b', 'c']
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                          index=labels,
                          columns=['A', 'B', 'C'])
        mdf = pdml.ModelFrame(df, target=pd.Series([1, 2, 3], index=labels))

        def verify(expected_target):
            # Shared checks on ``mdf`` while it holds a single '.target'
            # column in front of the data columns.
            self.assertIsInstance(mdf, pdml.ModelFrame)
            self.assertEqual(mdf.shape, (3, 4))
            tm.assert_index_equal(mdf.index, pd.Index(labels))
            tm.assert_index_equal(mdf.columns,
                                  pd.Index(['.target', 'A', 'B', 'C']))
            tm.assert_frame_equal(mdf.data, df)
            tm.assert_series_equal(mdf.target, expected_target)
            self.assertEqual(mdf.target.name, '.target')
            self.assertEqual(mdf.target_name, '.target')

        # A Series already named '.target' is assigned without a warning.
        new = pd.Series([4, 5, 6], index=labels, name='.target')
        mdf.target = new
        verify(new)

        # A Series with a different name: a UserWarning is expected and the
        # stored target is expected to carry the name '.target'.
        with tm.assert_produces_warning(UserWarning):
            new = pd.Series([4, 5, 6], index=labels, name='xxx')
            mdf.target = new
            verify(pd.Series(new, name='.target'))

        # A Series built without the frame's labels must be rejected.
        new = pd.Series([4, 5, 6], name='.target')
        with pytest.raises(ValueError,
                           match='data and target must have equal index'):
            mdf.target = new

        # A plain list is accepted and compared against a Series that uses
        # the frame's labels.
        mdf.target = [7, 8, 9]
        verify(pd.Series([7, 8, 9], index=labels, name='.target'))

        # A list of the wrong length must be rejected; the expected message
        # depends on the compat flag for the installed pandas.
        msg = ('Length of passed values is 2, index implies 3'
               if pdml.compat._PANDAS_ge_023
               else 'Wrong number of items passed 2, placement implies 3')
        with pytest.raises(ValueError, match=msg):
            mdf.target = [1, 2]

        # Assigning None removes the target column; target_name is still
        # expected to be '.target'.
        mdf.target = None

        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 3))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(['A', 'B', 'C']))
        tm.assert_frame_equal(mdf.data, df)
        self.assertEqual(mdf.target_name, '.target')
    def test_train_test_split_keep_index(self):
        """Check ``ms.train_test_split`` on frames without and with a target,
        with the original labels kept and with ``reset_index=True``."""
        labels = list('abcdefgh')
        train_labels = ['g', 'a', 'e', 'f', 'd', 'h']
        test_labels = ['c', 'b']

        def rows(frame, wanted, reset=False):
            # Expected frame: the wanted labels in the given order, optionally
            # with the index reset (and dropped).
            picked = frame.loc[wanted]
            if reset:
                return picked.reset_index(drop=True)
            return picked

        # Frame without a target.
        df = pdml.ModelFrame({'A': list(range(1, 9)),
                              'B': list(range(1, 9))},
                             index=labels)

        tr, te = df.ms.train_test_split(random_state=self.random_state)
        tm.assert_frame_equal(tr, rows(df, train_labels))
        tm.assert_frame_equal(te, rows(df, test_labels))

        tr, te = df.ms.train_test_split(random_state=self.random_state,
                                        reset_index=True)
        tm.assert_frame_equal(tr, rows(df, train_labels, reset=True))
        tm.assert_frame_equal(te, rows(df, test_labels, reset=True))

        # Frame with a target: the target values must follow the same rows.
        df = pdml.ModelFrame({'A': list(range(1, 9)),
                              'B': list(range(1, 9))},
                             index=labels,
                             target=[1, 2, 3, 4, 5, 6, 7, 8])

        tr, te = df.ms.train_test_split(random_state=self.random_state)
        tm.assert_frame_equal(tr, rows(df, train_labels))
        tm.assert_numpy_array_equal(tr.target.values,
                                    np.array([7, 1, 5, 6, 4, 8]))
        tm.assert_frame_equal(te, rows(df, test_labels))
        tm.assert_numpy_array_equal(te.target.values, np.array([3, 2]))

        tr, te = df.ms.train_test_split(random_state=self.random_state,
                                        reset_index=True)
        tm.assert_frame_equal(tr, rows(df, train_labels, reset=True))
        tm.assert_numpy_array_equal(tr.target.values,
                                    np.array([7, 1, 5, 6, 4, 8]))
        tm.assert_frame_equal(te, rows(df, test_labels, reset=True))
        tm.assert_numpy_array_equal(te.target.values, np.array([3, 2]))
Beispiel #51
0
    def test_frame_data_proparty(self):
        """Exercise the ``data`` property: assigning a DataFrame or a
        target-less ModelFrame, the rejected assignments, and deletion."""
        labels = ['a', 'b', 'c']
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]},
                          index=labels,
                          columns=['A', 'B', 'C'])
        s = pd.Series([1, 2, 3], index=labels, name='.target')

        mdf = pdml.ModelFrame(df, target=s)
        self.assertIsInstance(mdf, pdml.ModelFrame)

        def verify_target():
            # The target must be unaffected by any change to the data part.
            tm.assert_series_equal(mdf.target, s)
            self.assertEqual(mdf.target.name, '.target')
            self.assertEqual(mdf.target_name, '.target')

        def verify_data(expected_data, data_names):
            # Shared checks on ``mdf`` after a successful data assignment of
            # two columns.
            self.assertIsInstance(mdf, pdml.ModelFrame)
            self.assertEqual(mdf.shape, (3, 3))
            tm.assert_index_equal(mdf.index, pd.Index(labels))
            tm.assert_index_equal(mdf.columns,
                                  pd.Index(['.target'] + data_names))
            tm.assert_frame_equal(mdf.data, expected_data)
            verify_target()

        # Assign a plain DataFrame as the new data.
        new = pd.DataFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]},
                           index=labels,
                           columns=['X', 'Y'])
        mdf.data = new
        verify_data(new, ['X', 'Y'])

        # Assign a ModelFrame that has no target as the new data.
        new = pdml.ModelFrame({'M': [1, 2, 3], 'N': [4, 5, 6]},
                              index=labels,
                              columns=['M', 'N'])
        mdf.data = new
        verify_data(new, ['M', 'N'])

        # Data containing a column named like the target must be rejected.
        new = pd.DataFrame({'.target': [1, 2, 3], 'K': [4, 5, 6]},
                           index=labels)
        msg = "Passed data has the same column name as the target '.target'"
        with pytest.raises(ValueError, match=msg):
            mdf.data = new

        # A ModelFrame that itself has a target must be rejected.
        msg = "Cannot update with ModelFrame which has target attribute"
        with pytest.raises(ValueError, match=msg):
            mdf.data = mdf

        # Deleting the data leaves only the target column.
        del mdf.data
        self.assertIsInstance(mdf, pdml.ModelFrame)
        self.assertEqual(mdf.shape, (3, 1))
        tm.assert_index_equal(mdf.index, pd.Index(labels))
        tm.assert_index_equal(mdf.columns, pd.Index(['.target']))
        self.assertTrue(mdf.data is None)
        verify_target()