コード例 #1
0
    def test_update(self):
        # Series.update on a column must give the same result in Koalas and
        # pandas, both for the column itself and for the frame it came from.
        pandas_frame = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
        koalas_frame = ks.from_pandas(pandas_frame)

        pandas_col = pandas_frame.x
        koalas_col = koalas_frame.x

        # Apply the same in-place update on both sides, pandas first.
        pandas_col.update(pd.Series([4, 5, 6]))
        koalas_col.update(ks.Series([4, 5, 6]))

        self.assert_eq(koalas_col.sort_index(), pandas_col.sort_index())
        self.assert_eq(koalas_frame.sort_index(), pandas_frame.sort_index())
コード例 #2
0
    def test_idxmin(self):
        # idxmin must agree with pandas on a flat index and on a MultiIndex,
        # with the default skipna as well as with skipna=False.
        flat_series = pd.Series(data=[1, 4, 5], index=['A', 'B', 'C'])
        nested_index = pd.MultiIndex.from_arrays(
            [['a', 'a', 'b', 'b'], ['c', 'd', 'e', 'f']],
            names=('first', 'second'))
        nested_series = pd.Series(data=[1, 2, 4, 5], index=nested_index)

        for pandas_series in (flat_series, nested_series):
            koalas_series = ks.Series(pandas_series)
            self.assertEqual(koalas_series.idxmin(), pandas_series.idxmin())
            self.assertEqual(koalas_series.idxmin(skipna=False),
                             pandas_series.idxmin(skipna=False))

        # An empty series is expected to raise instead of returning a label.
        empty_series = ks.Series([])
        with self.assertRaisesRegex(ValueError, "an empty sequence"):
            empty_series.idxmin()
コード例 #3
0
 def test_aggregate(self):
     # Series.aggregate is expected to raise ValueError for a dict argument
     # and for a list that mixes function names with callables.
     koalas_series = ks.Series(pd.Series([10, 20, 15, 30, 45], name='x'))

     dict_error = 'func must be a string or list of strings'
     with self.assertRaisesRegex(ValueError, dict_error):
         koalas_series.aggregate({'x': ['min', 'max']})

     mixed_list_error = ('If the given function is a list, it '
                         'should only contains function names as strings.')
     with self.assertRaisesRegex(ValueError, mixed_list_error):
         koalas_series.aggregate(['min', max])
コード例 #4
0
ファイル: test_indexing.py プロジェクト: xiaming9880/koalas
    def test_at(self):
        # Exercise the label-based scalar accessor `.at` on a frame and a series.
        pandas_frame = self.pdf
        koalas_frame = self.kdf
        # Hand-built stand-in for pdf.loc[3]: .loc[n] does not currently work
        # with Koalas DataFrames (#383).
        row_three = ks.Series([3, 6], index=['a', 'b'], name='3')

        frame_usage = "Use DataFrame.at like .at[row_index, column_name]"
        series_usage = "Use Series.at like .at[column_name]"

        # Malformed keys must raise TypeError. In the context-manager form,
        # `msg` is only the text reported when the assertion fails; it is not
        # matched against the raised exception.
        with self.assertRaises(TypeError, msg=frame_usage):
            koalas_frame.at[3]
        with self.assertRaises(TypeError, msg=frame_usage):
            koalas_frame.at['ab']  # two characters long, but a str rather than a tuple
        with self.assertRaises(TypeError, msg=series_usage):
            row_three.at[3, 'b']

        # Frame lookups: a unique row label, then a label whose lookup is
        # compared as an array.
        self.assertEqual(koalas_frame.at[3, 'b'], 6)
        self.assertEqual(koalas_frame.at[3, 'b'], pandas_frame.at[3, 'b'])
        np.testing.assert_array_equal(koalas_frame.at[9, 'b'], np.array([0, 0, 0]))
        np.testing.assert_array_equal(koalas_frame.at[9, 'b'], pandas_frame.at[9, 'b'])

        # Series lookups.
        self.assertEqual(row_three.at['b'], 6)
        self.assertEqual(row_three.at['b'], pandas_frame.loc[3].at['b'])

        # Index labels longer than one character.
        self.assertEqual(
            ks.Series([0, 1], index=['ab', 'cd']).at['ab'],
            pd.Series([0, 1], index=['ab', 'cd']).at['ab'])

        # Unknown column or row labels raise KeyError, as they do in pandas.
        with self.assertRaises(KeyError, msg='x'):
            koalas_frame.at[3, 'x']
        with self.assertRaises(KeyError, msg=99):
            koalas_frame.at[99, 'b']

        # Tuple keys on either axis are rejected.
        with self.assertRaises(ValueError):
            koalas_frame.at[(3, 6), 'b']
        with self.assertRaises(KeyError):
            koalas_frame.at[3, ('x', 'b')]

        # Assignment through .at is expected to fail.
        with self.assertRaises(TypeError):
            koalas_frame.at[3, 'b'] = 10
コード例 #5
0
ファイル: test_series.py プロジェクト: royalosyin/koalas
    def test_xs(self):
        # A full three-level key passed to Series.xs must match pandas.
        levels = [['a', 'b', 'c'], ['lama', 'cow', 'falcon'],
                  ['speed', 'weight', 'length']]
        codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 1, 1, 1, 2, 2, 2],
                 [0, 1, 2, 0, 1, 2, 0, 1, 2]]
        multi_index = pd.MultiIndex(levels, codes)

        koalas_series = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
                                  index=multi_index)
        pandas_series = koalas_series.to_pandas()

        full_key = ('a', 'lama', 'speed')
        self.assert_eq(koalas_series.xs(full_key), pandas_series.xs(full_key))
コード例 #6
0
 def test_multiindex_transform_negative(self):
     # Calling spark.transform on a MultiIndex is expected to raise
     # NotImplementedError. Setup stays inside the context manager, exactly
     # as the expectation was originally scoped.
     expected_error = "MultiIndex does not support spark.transform yet"
     with self.assertRaisesRegex(NotImplementedError, expected_error):
         levels = [["lama", "cow", "falcon"], ["speed", "weight", "length"]]
         codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], [1, 1, 1, 1, 1, 2, 1, 2, 2]]
         multi_index = pd.MultiIndex(levels, codes)
         koalas_series = ks.Series(
             [45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=multi_index)
         koalas_series.index.spark.transform(lambda scol: scol)
コード例 #7
0
ファイル: test_series.py プロジェクト: HG1112/koalas
 def test_shift(self):
     # shift(periods=2) must render like pandas; fill_value=0 is only passed
     # when the installed pandas is not older than 0.24.2.
     pandas_series = pd.Series([10, 20, 15, 30, 45], name='x')
     koalas_series = ks.Series(pandas_series)

     shift_kwargs = {'periods': 2}
     if not LooseVersion(pd.__version__) < LooseVersion('0.24.2'):
         shift_kwargs['fill_value'] = 0
     self.assertEqual(repr(koalas_series.shift(**shift_kwargs)),
                      repr(pandas_series.shift(**shift_kwargs)))

     # A non-integer period is rejected.
     with self.assertRaisesRegex(ValueError, 'periods should be an int; however'):
         koalas_series.shift(periods=1.5)
コード例 #8
0
    def test_expanding_count(self):
        # Expanding.count behaves differently before and after pandas 1.0.0,
        # and Koalas follows the newer behaviour. With a recent pandas the
        # generic comparison is used; otherwise results are checked against
        # hand-written expectations.
        if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
            self._test_expanding_func("count")
            return

        # Series with a random float index.
        labels = np.random.rand(3)
        koalas_series = ks.Series([1, 2, 3], index=labels, name="a")
        expected = pd.Series([None, 2.0, 3.0], index=labels, name="a")
        self.assert_eq(
            koalas_series.expanding(2).count().sort_index(),
            expected.sort_index(),
            almost=True,
        )
        self.assert_eq(
            koalas_series.expanding(2).count().sum(), expected.sum(), almost=True
        )

        # Series with a MultiIndex.
        multi_index = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        koalas_series = ks.Series([1, 2, 3], index=multi_index, name="a")
        expected = pd.Series([None, 2.0, 3.0], index=multi_index, name="a")
        self.assert_eq(
            koalas_series.expanding(2).count().sort_index(),
            expected.sort_index(),
            almost=True,
        )

        # DataFrame with the default index.
        koalas_frame = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
        expected = pd.DataFrame(
            {"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]}
        )
        self.assert_eq(
            koalas_frame.expanding(2).count().sort_index(),
            expected.sort_index(),
            almost=True,
        )
        self.assert_eq(
            koalas_frame.expanding(2).count().sum(), expected.sum(), almost=True
        )

        # DataFrame with MultiIndex columns and a random float index.
        labels = np.random.rand(4)
        koalas_frame = ks.DataFrame(
            {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=labels
        )
        koalas_frame.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
        expected = pd.DataFrame(
            {("a", "x"): [None, 2.0, 3.0, 4.0], ("a", "y"): [None, 2.0, 3.0, 4.0]},
            index=labels,
        )
        self.assert_eq(
            koalas_frame.expanding(2).count().sort_index(),
            expected.sort_index(),
            almost=True,
        )
コード例 #9
0
    def _test_groupby_expanding_func(self, f):
        # Compare the repr of groupby(...).expanding(2).<f>() between Koalas
        # and pandas. Only the Koalas result is sorted by index before the
        # comparison; the pandas result is used as returned.
        def run_expanding(grouped):
            return getattr(grouped.expanding(2), f)()

        # Series grouped by itself.
        koalas_series = ks.Series([1, 2, 3])
        pandas_series = koalas_series.to_pandas()
        self.assert_eq(
            repr(run_expanding(koalas_series.groupby(koalas_series)).sort_index()),
            repr(run_expanding(pandas_series.groupby(pandas_series))))

        # Series with a MultiIndex, grouped by itself.
        multi_index = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])
        koalas_series = ks.Series([1, 2, 3], index=multi_index)
        pandas_series = koalas_series.to_pandas()
        self.assert_eq(
            repr(run_expanding(koalas_series.groupby(koalas_series)).sort_index()),
            repr(run_expanding(pandas_series.groupby(pandas_series))))

        # DataFrame grouped by its own column `a`.
        koalas_frame = ks.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]})
        pandas_frame = koalas_frame.to_pandas()
        self.assert_eq(
            repr(run_expanding(koalas_frame.groupby(koalas_frame.a)).sort_index()),
            repr(run_expanding(pandas_frame.groupby(pandas_frame.a))))
コード例 #10
0
    def test_assignment_series(self):
        # Assigning a Series taken from a different frame to a column must give
        # the same result in Koalas and pandas.
        # NOTE(review): self.pdf1 is read again for every scenario and then
        # mutated; presumably it yields a fresh frame on each access — verify
        # against the fixture definition.

        # Scenario 1: overwrite column "a" with column "a" of the second frame.
        # kser/pser are taken before the assignment and compared afterwards.
        kdf = ks.from_pandas(self.pdf1)
        pdf = self.pdf1
        kser = kdf.a
        pser = pdf.a
        kdf["a"] = self.kdf2.a
        pdf["a"] = self.pdf2.a

        self.assert_eq(kdf.sort_index(), pdf.sort_index())
        self.assert_eq(kser, pser)

        # Scenario 2: overwrite column "a" with column "b" of the second frame.
        kdf = ks.from_pandas(self.pdf1)
        pdf = self.pdf1
        kser = kdf.a
        pser = pdf.a
        kdf["a"] = self.kdf2.b
        pdf["a"] = self.pdf2.b

        self.assert_eq(kdf.sort_index(), pdf.sort_index())
        self.assert_eq(kser, pser)

        # Scenario 3: assign to the column label "c".
        kdf = ks.from_pandas(self.pdf1)
        pdf = self.pdf1
        kdf["c"] = self.kdf2.a
        pdf["c"] = self.pdf2.a

        self.assert_eq(kdf.sort_index(), pdf.sort_index())

        # Multi-index columns: relabel both frames with two-level columns, then
        # assign under the tuple label ("y", "c").
        kdf = ks.from_pandas(self.pdf1)
        pdf = self.pdf1
        columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b")])
        kdf.columns = columns
        pdf.columns = columns
        kdf[("y", "c")] = self.kdf2.a
        pdf[("y", "c")] = self.pdf2.a

        self.assert_eq(kdf.sort_index(), pdf.sort_index())

        # Index built from the "Koalas" column, which is also kept as a column
        # (drop=False); the index name is cleared before assigning a new column.
        pdf = pd.DataFrame({
            "a": [1, 2, 3],
            "Koalas": [0, 1, 2]
        }).set_index("Koalas", drop=False)
        kdf = ks.from_pandas(pdf)

        kdf.index.name = None
        kdf["NEW"] = ks.Series([100, 200, 300])

        pdf.index.name = None
        pdf["NEW"] = pd.Series([100, 200, 300])

        self.assert_eq(kdf.sort_index(), pdf.sort_index())
コード例 #11
0
 def test_overwrite_warns(self):
     # Registering an accessor under the existing attribute name "mean" must
     # emit a UserWarning that names the attribute, the accessor and the class.
     original_mean = ks.Series.mean
     try:
         with assert_produces_warning(UserWarning) as caught:
             register_series_accessor("mean")(CustomAccessor)
             koalas_series = ks.Series([1, 2])
             assert koalas_series.mean.prop == "item"
         warning_text = str(caught[0].message)
         for fragment in ("mean", "CustomAccessor", "Series"):
             assert fragment in warning_text
     finally:
         # Always put the saved attribute back on the class.
         ks.Series.mean = original_mean
コード例 #12
0
    def _test_expanding_func(self, f):
        # Compare the repr of expanding(2).<f>() between Koalas and pandas for
        # a Series, a MultiIndex Series, a DataFrame and a DataFrame with
        # MultiIndex columns.
        def expanded_repr(obj):
            return repr(getattr(obj.expanding(2), f)())

        # Series with a random float index.
        koalas_series = ks.Series([1, 2, 3], index=np.random.rand(3))
        pandas_series = koalas_series.to_pandas()
        self.assert_eq(expanded_repr(koalas_series), expanded_repr(pandas_series))

        # Series with a MultiIndex.
        multi_index = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'z')])
        koalas_series = ks.Series([1, 2, 3], index=multi_index)
        pandas_series = koalas_series.to_pandas()
        self.assert_eq(expanded_repr(koalas_series), expanded_repr(pandas_series))

        # DataFrame with the default index.
        koalas_frame = ks.DataFrame({'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]})
        pandas_frame = koalas_frame.to_pandas()
        self.assert_eq(expanded_repr(koalas_frame), expanded_repr(pandas_frame))

        # DataFrame with MultiIndex columns and a random float index.
        koalas_frame = ks.DataFrame(
            {'a': [1, 2, 3, 2], 'b': [4.0, 2.0, 3.0, 1.0]},
            index=np.random.rand(4))
        koalas_frame.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y')])
        pandas_frame = koalas_frame.to_pandas()
        self.assert_eq(expanded_repr(koalas_frame), expanded_repr(pandas_frame))
コード例 #13
0
    def test_dot(self):
        # Series.dot against another Series must match pandas; a DataFrame
        # argument is expected to be rejected with ValueError.
        pandas_left = pd.Series([90, 91, 85], index=[2, 4, 1])
        koalas_left = ks.from_pandas(pandas_left)
        pandas_right = pd.Series([90, 91, 85], index=[2, 4, 1])
        koalas_right = ks.from_pandas(pandas_right)

        self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

        # Same labels on the right-hand side, listed in a different order.
        koalas_right = ks.Series([90, 91, 85], index=[1, 2, 4])
        pandas_right = pd.Series([90, 91, 85], index=[1, 2, 4])

        self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

        # The right-hand side has one more entry than the left-hand side.
        koalas_right = ks.Series([90, 91, 85, 100], index=[2, 4, 1, 0])
        with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
            koalas_left.dot(koalas_right)

        # A DataFrame operand is not supported for now because of a
        # performance issue; a ValueError with a proper message is expected.
        koalas_frame = ks.DataFrame([[0, 1], [-2, 3], [4, -5]], index=[2, 4, 1])

        unsupported = r"Series\.dot\(\) is currently not supported*"
        with self.assertRaisesRegex(ValueError, unsupported):
            koalas_left.dot(koalas_frame)

        # Both operands share a MultiIndex.
        levels = [["lama", "cow", "falcon"], ["speed", "weight", "length"]]
        codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]]
        multi_index = pd.MultiIndex(levels, codes)
        pandas_left = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
                                index=multi_index)
        koalas_left = ks.from_pandas(pandas_left)
        pandas_right = pd.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3],
                                 index=multi_index)
        koalas_right = ks.from_pandas(pandas_right)

        self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))
コード例 #14
0
    def _test_groupby_rolling_func(self, f):
        # Compare the repr of groupby(...).rolling(2).<f>() between Koalas and
        # pandas for Series and DataFrame inputs.
        def run_rolling(grouped):
            return getattr(grouped.rolling(2), f)()

        # Series with a random float index, grouped by itself.
        koalas_series = ks.Series([1, 2, 3], index=np.random.rand(3))
        pandas_series = koalas_series.to_pandas()
        self.assert_eq(
            repr(run_rolling(koalas_series.groupby(koalas_series)).sort_index()),
            repr(run_rolling(pandas_series.groupby(pandas_series)).sort_index()),
        )

        # Series with a MultiIndex; here the pandas result is not re-sorted.
        multi_index = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
        koalas_series = ks.Series([1, 2, 3], index=multi_index)
        pandas_series = koalas_series.to_pandas()
        self.assert_eq(
            repr(run_rolling(koalas_series.groupby(koalas_series)).sort_index()),
            repr(run_rolling(pandas_series.groupby(pandas_series))),
        )

        # DataFrame grouped by its own column `a`.
        koalas_frame = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
        pandas_frame = koalas_frame.to_pandas()
        self.assert_eq(
            repr(run_rolling(koalas_frame.groupby(koalas_frame.a)).sort_index()),
            repr(run_rolling(pandas_frame.groupby(pandas_frame.a)).sort_index()),
        )

        # DataFrame with MultiIndex columns, grouped by one tuple label and
        # then by both.
        koalas_frame = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
        koalas_frame.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
        pandas_frame = koalas_frame.to_pandas()
        self.assert_eq(
            repr(run_rolling(koalas_frame.groupby(("a", "x"))).sort_index()),
            repr(run_rolling(pandas_frame.groupby(("a", "x"))).sort_index()),
        )

        self.assert_eq(
            repr(run_rolling(koalas_frame.groupby([("a", "x"), ("a", "y")])).sort_index()),
            repr(run_rolling(pandas_frame.groupby([("a", "x"), ("a", "y")])).sort_index()),
        )
コード例 #15
0
    def test_broadcast(self):
        # ks.broadcast must return a frame equal to its input and must reject
        # a Series argument with ValueError.
        frame_data = {
            "key": ["K0", "K1", "K2", "K3"],
            "A": ["A0", "A1", "A2", "A3"],
        }
        koalas_frame = ks.DataFrame(frame_data, columns=["key", "A"])
        self.assert_eq(koalas_frame, ks.broadcast(koalas_frame))

        koalas_series = ks.Series([1, 2, 3])
        received_type = type(koalas_series).__name__
        expected_error_message = "Invalid type : expected DataFrame got {}".format(
            received_type)
        with self.assertRaisesRegex(ValueError, expected_error_message):
            ks.broadcast(koalas_series)
コード例 #16
0
ファイル: test_series.py プロジェクト: royalosyin/koalas
    def test_index(self):
        # The index name(s) reported by Koalas and pandas must agree after
        # attribute assignment on the series.
        # NOTE(review): both assignments below set attributes on the Series
        # (`name`, `names`) while the assertions read `.index.name` and
        # `.index.names`; the index is a flat pd.Index throughout. Confirm this
        # is the intended coverage.
        labels = pd.Index([1, 2, 3, 4, 5, 6, 7, 8, 9])
        koalas_series = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
                                  index=labels)
        pandas_series = koalas_series.to_pandas()

        # Single name.
        koalas_series.name = 'koalas'
        pandas_series.name = 'koalas'
        self.assert_eq(koalas_series.index.name, pandas_series.index.name)

        # List of names.
        koalas_series.names = ['hello', 'koalas']
        pandas_series.names = ['hello', 'koalas']
        self.assert_eq(koalas_series.index.names, pandas_series.index.names)
コード例 #17
0
    def test_truncate(self):
        # truncate must match pandas on an increasing and on a decreasing
        # index, and must raise for an unsorted index or for reversed bounds.
        pandas_up = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[1, 2, 3, 4, 5, 6, 7])
        koalas_up = ks.Series(pandas_up)
        pandas_down = pd.Series([10, 20, 30, 40, 50, 60, 70], index=[7, 6, 5, 4, 3, 2, 1])
        koalas_down = ks.Series(pandas_down)

        # (positional args, keyword args) for each call on the increasing index.
        increasing_calls = [
            ((), {}),
            ((), {"before": 2}),
            ((), {"after": 5}),
            ((), {"copy": False}),
            ((2, 5), {"copy": False}),
        ]
        for args, kwargs in increasing_calls:
            self.assert_eq(koalas_up.truncate(*args, **kwargs),
                           pandas_up.truncate(*args, **kwargs))

        decreasing_calls = [
            ((4, 6), {}),
            ((4, 6), {"copy": False}),
        ]
        for args, kwargs in decreasing_calls:
            self.assert_eq(koalas_down.truncate(*args, **kwargs),
                           pandas_down.truncate(*args, **kwargs))

        # Index that is neither increasing nor decreasing.
        unsorted_series = ks.Series([10, 20, 30, 40, 50, 60, 70],
                                    index=[1, 2, 3, 4, 3, 2, 1])
        unsorted_error = "truncate requires a sorted index"
        with self.assertRaisesRegex(ValueError, unsorted_error):
            unsorted_series.truncate()

        # `before` greater than `after`.
        sorted_series = ks.Series([10, 20, 30, 40, 50, 60, 70],
                                  index=[1, 2, 3, 4, 5, 6, 7])
        bounds_error = "Truncate: 2 must be after 5"
        with self.assertRaisesRegex(ValueError, bounds_error):
            sorted_series.truncate(5, 2)
コード例 #18
0
    def test_getitem(self):
        # Label lookup through [] must match pandas on a flat index with a
        # repeated label and on a three-level MultiIndex.
        pandas_series = pd.Series([10, 20, 15, 30, 45], ['A', 'A', 'B', 'C', 'D'])
        koalas_series = ks.Series(pandas_series)

        for label in ('A', 'B'):
            self.assert_eq(koalas_series[label], pandas_series[label])

        # MultiIndex: partial keys of depth one and two.
        levels = [['a', 'b', 'c'],
                  ['lama', 'cow', 'falcon'],
                  ['speed', 'weight', 'length']]
        codes = [[0, 0, 0, 0, 0, 0, 1, 1, 1],
                 [0, 0, 0, 1, 1, 1, 2, 2, 2],
                 [0, 0, 0, 0, 1, 2, 0, 1, 2]]
        multi_index = pd.MultiIndex(levels, codes)
        pandas_series = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
                                  name='0', index=multi_index)
        koalas_series = ks.Series(pandas_series)

        self.assert_eq(koalas_series['a'], pandas_series['a'])
        self.assert_eq(koalas_series['a', 'lama'], pandas_series['a', 'lama'])

        # A key with more parts than the index has levels is rejected.
        too_deep = r"'Key length \(4\) exceeds index depth \(3\)'"
        with self.assertRaisesRegex(KeyError, too_deep):
            koalas_series[('a', 'lama', 'speed', 'x')]
コード例 #19
0
ファイル: test_indexing.py プロジェクト: zuoxiaolei/koalas
    def test_iat(self):
        # Exercise the position-based scalar accessor `.iat` on a frame and a
        # series.
        pandas_frame = self.pdf
        koalas_frame = self.kdf
        # Hand-built stand-in for pdf.loc[3]: .loc[n] does not currently work
        # with Koalas DataFrames (#383).
        row_three = ks.Series([3, 6], index=["a", "b"], name="3")

        frame_usage = (
            "Use DataFrame.at like .iat[row_interget_position, column_integer_position]"
        )
        label_usage = "iAt based indexing on multi-index can only have tuple values"
        series_usage = "Use Series.iat like .iat[row_integer_position]"

        # Malformed keys. In the context-manager form, `msg` is only the text
        # reported when the assertion fails; it is not matched against the
        # raised exception.
        with self.assertRaises(TypeError, msg=frame_usage):
            koalas_frame.iat[3]
        with self.assertRaises(ValueError, msg=label_usage):
            koalas_frame.iat[3, "b"]  # a column label where a position is expected
        with self.assertRaises(TypeError, msg=series_usage):
            row_three.iat[3, "b"]

        # Frame lookup by position.
        self.assertEqual(koalas_frame.iat[7, 0], 8)
        self.assertEqual(koalas_frame.iat[7, 0], pandas_frame.iat[7, 0])

        # Series lookup by position.
        self.assertEqual(row_three.iat[1], 6)
        self.assertEqual(row_three.iat[1], pandas_frame.loc[3].iat[1])

        # Out-of-range column or row positions raise KeyError.
        with self.assertRaises(KeyError, msg=99):
            koalas_frame.iat[0, 99]
        with self.assertRaises(KeyError, msg=99):
            koalas_frame.iat[99, 0]

        # Tuple positions on either axis are rejected.
        with self.assertRaises(ValueError):
            koalas_frame.iat[(1, 1), 1]
        with self.assertRaises(ValueError):
            koalas_frame.iat[1, (1, 1)]

        # Assignment through .iat is expected to fail.
        with self.assertRaises(TypeError):
            koalas_frame.iat[4, 1] = 10
コード例 #20
0
ファイル: test_series.py プロジェクト: HG1112/koalas
    def test_replace(self):
        # replace() with no argument or an empty dict must match pandas;
        # unsupported arguments must raise with the expected messages.
        pandas_series = pd.Series([10, 20, 15, 30, 45], name='x')
        koalas_series = ks.Series(pandas_series)

        self.assert_eq(koalas_series.replace(), pandas_series.replace())
        self.assert_eq(koalas_series.replace({}), pandas_series.replace({}))

        # (exception type, expected message, call that must fail). Each call is
        # wrapped in a lambda so its arguments are built inside the assertion.
        failing_calls = [
            (ValueError,
             "'to_replace' should be one of str, list, dict, int, float",
             lambda: koalas_series.replace(ks.range(5))),
            (ValueError,
             "Replacement lists must match in length. Expecting 3 got 2",
             lambda: koalas_series.replace([10, 20, 30], [1, 2])),
            (NotImplementedError,
             "replace currently not support for regex",
             lambda: koalas_series.replace(r'^1.$', regex=True)),
        ]
        for error_type, expected_message, failing_call in failing_calls:
            with self.assertRaisesRegex(error_type, expected_message):
                failing_call()
コード例 #21
0
ファイル: test_series.py プロジェクト: royalosyin/koalas
    def test_pop(self):
        # pop with a full MultiIndex key must match pandas; invalid keys must
        # raise with the expected messages.
        levels = [['lama', 'cow', 'falcon'], ['speed', 'weight', 'length']]
        codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]]
        multi_index = pd.MultiIndex(levels, codes)
        koalas_series = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
                                  index=multi_index)
        pandas_series = koalas_series.to_pandas()

        # Pop from Koalas first, then from pandas, and compare what came out.
        popped_from_koalas = koalas_series.pop(('lama', 'speed'))
        popped_from_pandas = pandas_series.pop(('lama', 'speed'))
        self.assert_eq(popped_from_koalas, popped_from_pandas)

        # Key that is neither a string nor a tuple.
        wrong_type = "'key' should be string or tuple that contains strings"
        with self.assertRaisesRegex(ValueError, wrong_type):
            koalas_series.pop(0)

        # Tuple key that contains a non-string part.
        wrong_part = ("'key' should have index names as only strings "
                      "or a tuple that contain index names as only strings")
        with self.assertRaisesRegex(ValueError, wrong_part):
            koalas_series.pop(('lama', 0))

        # Tuple key with more parts than the index has levels.
        too_deep = r"'Key length \(3\) exceeds index depth \(2\)'"
        with self.assertRaisesRegex(KeyError, too_deep):
            koalas_series.pop(('lama', 'speed', 'x'))
コード例 #22
0
    def test_get_dummies(self):
        # ks.get_dummies must match pd.get_dummies(..., dtype=np.int8) for a
        # Series and a DataFrame; sparse output is expected to be rejected.
        #
        # Categorical inputs are currently disabled in this test:
        #   pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')
        #   pd.Series(pd.Categorical([1, 1, 1, 2, 2, 1, 3, 4],
        #                            categories=[4, 3, 2, 1]))
        #   'b': pd.Categorical(list('abcdabcd')) as the frame's second column
        pandas_inputs = [
            pd.Series([1, 1, 1, 2, 2, 1, 3, 4]),
            pd.DataFrame({
                "a": [1, 2, 3, 4, 4, 3, 2, 1],
                "b": list("abcdabcd"),
            }),
        ]
        for pandas_input in pandas_inputs:
            koalas_input = ks.from_pandas(pandas_input)
            self.assert_eq(ks.get_dummies(koalas_input),
                           pd.get_dummies(pandas_input, dtype=np.int8))

        koalas_series = ks.Series([1, 1, 1, 2, 2, 1, 3, 4])
        sparse_error = "get_dummies currently does not support sparse"
        with self.assertRaisesRegex(NotImplementedError, sparse_error):
            ks.get_dummies(koalas_series, sparse=True)
コード例 #23
0
ファイル: test_series.py プロジェクト: HG1112/koalas
    def test_clip(self):
        # clip must reject list-like bounds, match pandas for scalar bounds and
        # leave a string series unchanged.
        pandas_series = pd.Series([0, 2, 4])
        koalas_series = ks.from_pandas(pandas_series)

        # List-like bounds. `msg` is only the text reported when the assertion
        # fails; it is not matched against the raised exception.
        list_like_note = (
            "List-like value are not supported for 'lower' and 'upper' at the moment"
        )
        with self.assertRaises(ValueError, msg=list_like_note):
            koalas_series.clip(lower=[1])
        with self.assertRaises(ValueError, msg=list_like_note):
            koalas_series.clip(upper=[1])

        # No bounds, lower only, upper only, then both.
        self.assert_eq(koalas_series.clip(), pandas_series.clip())
        self.assert_eq(koalas_series.clip(1), pandas_series.clip(1))
        self.assert_eq(koalas_series.clip(upper=3), pandas_series.clip(upper=3))
        self.assert_eq(koalas_series.clip(1, 3), pandas_series.clip(1, 3))

        # String values are expected to come back unchanged.
        text_series = ks.Series(['a', 'b', 'c'])
        self.assert_eq(text_series.clip(1, 3), text_series)
コード例 #24
0
    def test_dot(self):
        # Series.dot must match pandas for Series and DataFrame operands.
        def check_frame(pandas_lhs, koalas_lhs, pandas_rhs):
            # Convert the pandas frame first, then compare both dot products.
            koalas_rhs = ks.from_pandas(pandas_rhs)
            self.assert_eq(koalas_lhs.dot(koalas_rhs), pandas_lhs.dot(pandas_rhs))

        pandas_left = pd.Series([90, 91, 85], index=[2, 4, 1])
        koalas_left = ks.from_pandas(pandas_left)
        pandas_right = pd.Series([90, 91, 85], index=[2, 4, 1])
        koalas_right = ks.from_pandas(pandas_right)

        self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

        # Same labels on the right-hand side, listed in a different order.
        koalas_right = ks.Series([90, 91, 85], index=[1, 2, 4])
        pandas_right = pd.Series([90, 91, 85], index=[1, 2, 4])

        self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

        # The right-hand side has one more entry than the left-hand side.
        koalas_right = ks.Series([90, 91, 85, 100], index=[2, 4, 1, 0])
        with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
            koalas_left.dot(koalas_right)

        # Both operands share a MultiIndex.
        levels = [["lama", "cow", "falcon"], ["speed", "weight", "length"]]
        codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]]
        multi_index = pd.MultiIndex(levels, codes)
        pandas_left = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
                                index=multi_index)
        koalas_left = ks.from_pandas(pandas_left)
        pandas_right = pd.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3],
                                 index=multi_index)
        koalas_right = ks.from_pandas(pandas_right)
        self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

        pandas_left = pd.Series([0, 1, 2, 3])
        koalas_left = ks.from_pandas(pandas_left)

        # DataFrame operand with default column labels.
        pandas_frame = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
        check_frame(pandas_left, koalas_left, pandas_frame)

        # DataFrame operand with an Index as columns, unnamed and then named.
        pandas_frame.columns = pd.Index(["x", "y"])
        check_frame(pandas_left, koalas_left, pandas_frame)
        pandas_frame.columns = pd.Index(["x", "y"], name="cols_name")
        check_frame(pandas_left, koalas_left, pandas_frame)

        # Same frame with its rows reordered.
        pandas_frame = pandas_frame.reindex([1, 0, 2, 3])
        check_frame(pandas_left, koalas_left, pandas_frame)

        # DataFrame operand with MultiIndex columns, unnamed and then named.
        pandas_frame.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
        check_frame(pandas_left, koalas_left, pandas_frame)
        pandas_frame.columns = pd.MultiIndex.from_tuples(
            [("a", "x"), ("b", "y")], names=["cols_name1", "cols_name2"])
        check_frame(pandas_left, koalas_left, pandas_frame)

        # Left operand is a column taken from one Koalas frame; the right
        # operand is a separately constructed Koalas frame.
        koalas_left = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).b
        pandas_left = koalas_left.to_pandas()
        koalas_frame = ks.DataFrame({"c": [7, 8, 9]})
        pandas_frame = koalas_frame.to_pandas()
        self.assert_eq(koalas_left.dot(koalas_frame), pandas_left.dot(pandas_frame))
コード例 #25
0
ファイル: test_series.py プロジェクト: HG1112/koalas
 def test_astype(self):
     # An unknown dtype name passed to astype is expected to raise ValueError.
     koalas_series = ks.Series(pd.Series([10, 20, 15, 30, 45], name='x'))
     unknown_dtype_error = 'Type int63 not understood'
     with self.assertRaisesRegex(ValueError, unknown_dtype_error):
         koalas_series.astype('int63')
コード例 #26
0
ファイル: test_series.py プロジェクト: HG1112/koalas
 def test_median(self):
     # A non-integer `accuracy` passed to median is expected to raise ValueError.
     accuracy_error = "accuracy must be an integer; however"
     with self.assertRaisesRegex(ValueError, accuracy_error):
         ks.Series([24., 21., 25., 33., 26.]).median(accuracy="a")
コード例 #27
0
ファイル: test_series.py プロジェクト: zhouzach/koalas
 def test_nlargest(self):
     # nlargest must match pandas with an explicit n and with the default n,
     # on data that contains a NaN.
     values = [1, 2, 3, 4, np.nan, 6]
     pandas_series = pd.Series(values, name='x')
     koalas_series = koalas.Series(values, name='x')

     self.assert_eq(koalas_series.nlargest(n=3), pandas_series.nlargest(n=3))
     self.assert_eq(koalas_series.nlargest(), pandas_series.nlargest())
コード例 #28
0
ファイル: test_expanding.py プロジェクト: zuoxiaolei/koalas
 def test_groupby_expanding_count(self):
     # ExpandingGroupby.count behaves differently between pandas>=1.0.0 and
     # earlier versions; Koalas follows the behaviour of the latest pandas.
     # With a recent pandas the generic comparison helper is used; otherwise
     # the Koalas result is checked against hand-written expectations.
     if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
         self._test_groupby_expanding_func("count")
     else:
         # Series grouped by itself. The expected index pairs each value with
         # its original (random) index label.
         kser = ks.Series([1, 2, 3], index=np.random.rand(3))
         midx = pd.MultiIndex.from_tuples(
             list(
                 zip(kser.to_pandas().values,
                     kser.index.to_pandas().values)))
         expected_result = ks.Series([np.nan, np.nan, np.nan], index=midx)
         self.assert_eq(
             kser.groupby(kser).expanding(2).count().sort_index(),
             expected_result.sort_index(),
             almost=True,
         )
         # Series with a MultiIndex, grouped by itself. The expected index is
         # the value followed by the two original index levels.
         kser = ks.Series([1, 2, 3],
                          index=pd.MultiIndex.from_tuples([("a", "x"),
                                                           ("a", "y"),
                                                           ("b", "z")]))
         midx = pd.MultiIndex.from_tuples([(1, "a", "x"), (2, "a", "y"),
                                           (3, "b", "z")])
         expected_result = ks.Series([np.nan, np.nan, np.nan], index=midx)
         self.assert_eq(
             kser.groupby(kser).expanding(2).count().sort_index(),
             expected_result.sort_index(),
             almost=True,
         )
         # DataFrame grouped by its own column `a`. The expected index is
         # (group key, original row position).
         kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
         midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
         expected_result = ks.DataFrame(
             {
                 "a": [None, None, 2.0, None],
                 "b": [None, None, 2.0, None]
             },
             index=midx)
         self.assert_eq(
             kdf.groupby(kdf.a).expanding(2).count().sort_index(),
             expected_result.sort_index(),
             almost=True,
         )
         # DataFrame with MultiIndex columns, grouped by the single tuple
         # label ("a", "x").
         kdf = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
         kdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
         midx = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
         expected_result = ks.DataFrame(
             {
                 "a": [None, None, 2.0, None],
                 "b": [None, None, 2.0, None]
             },
             index=midx)
         expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"),
                                                              ("a", "y")])
         self.assert_eq(
             kdf.groupby(("a", "x")).expanding(2).count().sort_index(),
             expected_result.sort_index(),
             almost=True,
         )
         # Same frame grouped by both tuple labels. The expected index is
         # (first key, second key, original row position) and every expected
         # value is NaN.
         midx = pd.MultiIndex.from_tuples([(1, 4.0, 0), (2, 1.0, 3),
                                           (2, 2.0, 1), (3, 3.0, 2)])
         expected_result = ks.DataFrame(
             {
                 "a": [np.nan, np.nan, np.nan, np.nan],
                 "b": [np.nan, np.nan, np.nan, np.nan]
             },
             index=midx,
         )
         expected_result.columns = pd.MultiIndex.from_tuples([("a", "x"),
                                                              ("a", "y")])
         self.assert_eq(
             kdf.groupby([("a", "x"),
                          ("a", "y")]).expanding(2).count().sort_index(),
             expected_result.sort_index(),
             almost=True,
         )
コード例 #29
0
# Attach the model's predictions to the frame as a new column.
df["prediction"] = model.predict(df)

stop = datetime.now()

# `timedelta.seconds` is only the seconds component of the interval and wraps
# every 24 hours; `total_seconds()` covers the whole interval. It is truncated
# to whole seconds so the printed value keeps the same integer format.
# NOTE(review): `start` is assumed to be a datetime set earlier in the script.
elapsed_seconds = int((stop - start).total_seconds())
print("Temps préparation et inférence (ML) : ", elapsed_seconds, "s")

# %%
##### 7th change: the score therefore has to be recomputed by hand

from databricks.koalas.config import set_option, reset_option

# NOTE(review): presumably required because `df` and `reel` below are separate
# Koalas frames that get combined — confirm.
set_option("compute.ops_on_diff_frames", True)

# Score: the coefficient R^2 is defined as (1 - u/v), where u is the residual
# sum of squares ((y_true - y_pred) ** 2).sum() and v is the total sum of
# squares ((y_true - y_true.mean()) ** 2).sum().

# Put the true values next to the predictions, in a column named 'Reel'.
reel = ks.Series(y_test).to_frame().rename(columns={0: 'Reel'})
result = ks.concat([df, reel], axis=1)

# u: residual sum of squares.
result['square_diff_true_pred'] = (result['Reel'] - result['prediction'])**2
u = result['square_diff_true_pred'].sum()
# v: total sum of squares around the mean of the true values.
v = ((result['Reel'] - result['Reel'].mean())**2).sum()

score = (1 - u / v)
print(f"score: {score}")

# %% [markdown]
# ## Entrainement et inférence avec Pipeline
# %% [markdown]
# Seuls les modèles entrainés et les prédictions peuvent être utilisés avec koalas

# %%
コード例 #30
0
 def test_unsupported_type(self):
     # Accessing `.dt` on an integer series is expected to raise ValueError.
     unsupported_error = "Cannot call DatetimeMethods on type LongType"
     with self.assertRaisesRegex(ValueError, unsupported_error):
         ks.Series([0]).dt