def test_update(self):
    """Check that ``Series.update`` on a column matches pandas.

    Both the updated column and the frame it was taken from are compared
    with their pandas counterparts after the update.
    """
    pandas_df = pd.DataFrame({"x": [1, 2, 3], "y": [10, 20, 30]})
    koalas_df = ks.from_pandas(pandas_df)

    pandas_col = pandas_df.x
    koalas_col = koalas_df.x

    pandas_col.update(pd.Series([4, 5, 6]))
    koalas_col.update(ks.Series([4, 5, 6]))

    # First the column itself, then the frame it belongs to.
    for actual, expected in ((koalas_col, pandas_col), (koalas_df, pandas_df)):
        self.assert_eq(actual.sort_index(), expected.sort_index())
def test_idxmin(self):
    """Compare ``Series.idxmin`` with pandas for flat and MultiIndex series.

    Also checks that an empty Koalas series raises ``ValueError``.
    """

    def check(expected):
        # Same assertions for every fixture: default call and skipna=False.
        actual = ks.Series(expected)
        self.assertEqual(actual.idxmin(), expected.idxmin())
        self.assertEqual(actual.idxmin(skipna=False), expected.idxmin(skipna=False))

    check(pd.Series(data=[1, 4, 5], index=["A", "B", "C"]))

    two_level = pd.MultiIndex.from_arrays(
        [["a", "a", "b", "b"], ["c", "d", "e", "f"]],
        names=("first", "second"),
    )
    check(pd.Series(data=[1, 2, 4, 5], index=two_level))

    empty = ks.Series([])
    with self.assertRaisesRegex(ValueError, "an empty sequence"):
        empty.idxmin()
def test_aggregate(self):
    """Check that ``Series.aggregate`` rejects unsupported ``func`` arguments."""
    koalas_series = ks.Series(pd.Series([10, 20, 15, 30, 45], name="x"))

    # (argument passed to aggregate, regex the ValueError must match)
    rejected = (
        (
            {"x": ["min", "max"]},
            "func must be a string or list of strings",
        ),
        (
            ["min", max],
            "If the given function is a list, it "
            "should only contains function names as strings.",
        ),
    )
    for func, pattern in rejected:
        with self.assertRaisesRegex(ValueError, pattern):
            koalas_series.aggregate(func)
def test_at(self):
    """Exercise the ``.at`` accessor on Koalas DataFrames and Series against pandas."""
    pandas_df = self.pdf
    koalas_df = self.kdf

    # Hand-built Koalas counterpart of pdf.loc[3]; needed because .loc[n] does not
    # currently work with Koalas DataFrames (#383).
    row_three = ks.Series([3, 6], index=["a", "b"], name="3")

    # Malformed keys must raise TypeError.
    # NOTE: the `msg` keyword of assertRaises is only the text reported when the
    # assertion itself fails; it is not compared with the raised exception's message.
    frame_hint = "Use DataFrame.at like .at[row_index, column_name]"
    series_hint = "Use Series.at like .at[column_name]"
    with self.assertRaises(TypeError, msg=frame_hint):
        koalas_df.at[3]
    with self.assertRaises(TypeError, msg=frame_hint):
        koalas_df.at["ab"]  # 'ab' is of length 2 but str type instead of tuple
    with self.assertRaises(TypeError, msg=series_hint):
        row_three.at[3, "b"]

    # DataFrame lookups: a single matching row, then a row label that yields an array.
    self.assertEqual(koalas_df.at[3, "b"], 6)
    self.assertEqual(koalas_df.at[3, "b"], pandas_df.at[3, "b"])
    np.testing.assert_array_equal(koalas_df.at[9, "b"], np.array([0, 0, 0]))
    np.testing.assert_array_equal(koalas_df.at[9, "b"], pandas_df.at[9, "b"])

    # Series lookups.
    self.assertEqual(row_three.at["b"], 6)
    self.assertEqual(row_three.at["b"], pandas_df.loc[3].at["b"])

    # Multi-character index labels.
    self.assertEqual(
        ks.Series([0, 1], index=["ab", "cd"]).at["ab"],
        pd.Series([0, 1], index=["ab", "cd"]).at["ab"],
    )

    # Unknown column or row labels raise KeyError, as in pandas.
    with self.assertRaises(KeyError, msg="x"):
        koalas_df.at[3, "x"]
    with self.assertRaises(KeyError, msg=99):
        koalas_df.at[99, "b"]

    # Tuple-valued keys on either axis are rejected.
    with self.assertRaises(ValueError):
        koalas_df.at[(3, 6), "b"]
    with self.assertRaises(KeyError):
        koalas_df.at[3, ("x", "b")]

    # Assignment through .at is expected to fail.
    with self.assertRaises(TypeError):
        koalas_df.at[3, "b"] = 10
def test_xs(self):
    """Compare ``Series.xs`` with pandas for a full three-level key."""
    levels = [
        ["a", "b", "c"],
        ["lama", "cow", "falcon"],
        ["speed", "weight", "length"],
    ]
    codes = [
        [0, 0, 0, 1, 1, 1, 2, 2, 2],
        [0, 0, 0, 1, 1, 1, 2, 2, 2],
        [0, 1, 2, 0, 1, 2, 0, 1, 2],
    ]
    three_level = pd.MultiIndex(levels, codes)

    koalas_series = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=three_level)
    pandas_series = koalas_series.to_pandas()

    key = ("a", "lama", "speed")
    self.assert_eq(koalas_series.xs(key), pandas_series.xs(key))
def test_multiindex_transform_negative(self):
    """Check that ``spark.transform`` on a MultiIndex raises ``NotImplementedError``.

    The fixture is built outside the ``assertRaisesRegex`` context so that only the
    call under test can satisfy the expected exception.
    """
    midx = pd.MultiIndex(
        [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [1, 1, 1, 1, 1, 2, 1, 2, 2]],
    )
    s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)

    with self.assertRaisesRegex(
        NotImplementedError, "MultiIndex does not support spark.transform yet"
    ):
        s.index.spark.transform(lambda scol: scol)
def test_shift(self):
    """Compare ``Series.shift`` with pandas and check the ``periods`` type validation."""
    pandas_series = pd.Series([10, 20, 15, 30, 45], name="x")
    koalas_series = ks.Series(pandas_series)

    # `fill_value` is only passed when the installed pandas is at least 0.24.2.
    shift_kwargs = {"periods": 2}
    if LooseVersion(pd.__version__) >= LooseVersion("0.24.2"):
        shift_kwargs["fill_value"] = 0

    self.assertEqual(
        repr(koalas_series.shift(**shift_kwargs)),
        repr(pandas_series.shift(**shift_kwargs)),
    )

    with self.assertRaisesRegex(ValueError, "periods should be an int; however"):
        koalas_series.shift(periods=1.5)
def test_expanding_count(self):
    """Check ``expanding(2).count()`` for Series and DataFrames.

    With pandas >= 1.0.0 the generic pandas comparison helper is used. With older
    pandas the results are compared with hard-coded expectations instead.
    """
    # The behaviour of Expanding.count differs between pandas>=1.0.0 and lower,
    # and we follow the behaviour of the latest pandas.
    if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
        self._test_expanding_func("count")
        return

    # Series with a random float index
    labels = np.random.rand(3)
    koalas_series = ks.Series([1, 2, 3], index=labels, name="a")
    expected = pd.Series([None, 2.0, 3.0], index=labels, name="a")
    self.assert_eq(
        koalas_series.expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )
    self.assert_eq(koalas_series.expanding(2).count().sum(), expected.sum(), almost=True)

    # Series with a MultiIndex
    two_level = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")])
    koalas_series = ks.Series([1, 2, 3], index=two_level, name="a")
    expected = pd.Series([None, 2.0, 3.0], index=two_level, name="a")
    self.assert_eq(
        koalas_series.expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )

    # DataFrame with the default index
    koalas_df = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
    expected = pd.DataFrame({"a": [None, 2.0, 3.0, 4.0], "b": [None, 2.0, 3.0, 4.0]})
    self.assert_eq(
        koalas_df.expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )
    self.assert_eq(koalas_df.expanding(2).count().sum(), expected.sum(), almost=True)

    # DataFrame with MultiIndex columns and a random float index
    labels = np.random.rand(4)
    koalas_df = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=labels)
    koalas_df.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
    expected = pd.DataFrame(
        {("a", "x"): [None, 2.0, 3.0, 4.0], ("a", "y"): [None, 2.0, 3.0, 4.0]},
        index=labels,
    )
    self.assert_eq(
        koalas_df.expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )
def _test_groupby_expanding_func(self, f):
    """Compare a grouped ``expanding(2)`` aggregation between Koalas and pandas.

    ``f`` is the name of the method to call on the expanding object (the caller in
    this file passes ``"count"``). Results are compared through their ``repr``.
    """

    def compare(koalas_grouped, pandas_grouped):
        # Only the Koalas result is sorted before its repr is taken.
        actual = getattr(koalas_grouped.expanding(2), f)().sort_index()
        expected = getattr(pandas_grouped.expanding(2), f)()
        self.assert_eq(repr(actual), repr(expected))

    # Series grouped by its own values
    koalas_series = ks.Series([1, 2, 3])
    pandas_series = koalas_series.to_pandas()
    compare(koalas_series.groupby(koalas_series), pandas_series.groupby(pandas_series))

    # Series with a MultiIndex
    koalas_series = ks.Series(
        [1, 2, 3],
        index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]),
    )
    pandas_series = koalas_series.to_pandas()
    compare(koalas_series.groupby(koalas_series), pandas_series.groupby(pandas_series))

    # DataFrame grouped by column "a"
    koalas_df = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
    pandas_df = koalas_df.to_pandas()
    compare(koalas_df.groupby(koalas_df.a), pandas_df.groupby(pandas_df.a))
def test_assignment_series(self):
    """Assign a Series from another frame into a DataFrame and compare with pandas.

    Covers overwriting an existing column, adding a new column, MultiIndex column
    labels, and a frame whose index shares a name with one of its columns.
    """
    # Overwrite column "a" with column "a", then with column "b", of the second frame.
    # The previously extracted column objects are compared as well.
    for source in ("a", "b"):
        koalas_df = ks.from_pandas(self.pdf1)
        pandas_df = self.pdf1
        koalas_col = koalas_df.a
        pandas_col = pandas_df.a

        koalas_df["a"] = getattr(self.kdf2, source)
        pandas_df["a"] = getattr(self.pdf2, source)

        self.assert_eq(koalas_df.sort_index(), pandas_df.sort_index())
        self.assert_eq(koalas_col, pandas_col)

    # Add a brand-new column.
    koalas_df = ks.from_pandas(self.pdf1)
    pandas_df = self.pdf1
    koalas_df["c"] = self.kdf2.a
    pandas_df["c"] = self.pdf2.a
    self.assert_eq(koalas_df.sort_index(), pandas_df.sort_index())

    # Multi-index columns
    koalas_df = ks.from_pandas(self.pdf1)
    pandas_df = self.pdf1
    two_level_columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b")])
    koalas_df.columns = two_level_columns
    pandas_df.columns = two_level_columns
    koalas_df[("y", "c")] = self.kdf2.a
    pandas_df[("y", "c")] = self.pdf2.a
    self.assert_eq(koalas_df.sort_index(), pandas_df.sort_index())

    # Index built from the "Koalas" column (kept as a column too), then un-named.
    pandas_df = pd.DataFrame({"a": [1, 2, 3], "Koalas": [0, 1, 2]}).set_index(
        "Koalas", drop=False
    )
    koalas_df = ks.from_pandas(pandas_df)

    koalas_df.index.name = None
    koalas_df["NEW"] = ks.Series([100, 200, 300])

    pandas_df.index.name = None
    pandas_df["NEW"] = pd.Series([100, 200, 300])

    self.assert_eq(koalas_df.sort_index(), pandas_df.sort_index())
def test_overwrite_warns(self):
    """Registering an accessor under the existing name ``mean`` must emit a UserWarning.

    The original ``ks.Series.mean`` is restored afterwards, whatever the outcome.
    """
    original_mean = ks.Series.mean
    try:
        with assert_produces_warning(UserWarning) as caught:
            register_series_accessor("mean")(CustomAccessor)
            series = ks.Series([1, 2])
            assert series.mean.prop == "item"

        warning_text = str(caught[0].message)
        # The warning text must mention each of these fragments.
        for fragment in ("mean", "CustomAccessor", "Series"):
            assert fragment in warning_text
    finally:
        ks.Series.mean = original_mean
def _test_expanding_func(self, f):
    """Compare an ``expanding(2)`` aggregation between Koalas and pandas.

    ``f`` is the name of the method to call on the expanding object. Each Koalas
    fixture is converted to pandas and the two results are compared via ``repr``.
    """

    def compare(koalas_obj):
        pandas_obj = koalas_obj.to_pandas()
        self.assert_eq(
            repr(getattr(koalas_obj.expanding(2), f)()),
            repr(getattr(pandas_obj.expanding(2), f)()),
        )

    # Series with a random float index
    compare(ks.Series([1, 2, 3], index=np.random.rand(3)))

    # Series with a MultiIndex
    compare(
        ks.Series(
            [1, 2, 3],
            index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]),
        )
    )

    # DataFrame with the default index
    compare(ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}))

    # DataFrame with MultiIndex columns and a random float index
    koalas_df = ks.DataFrame(
        {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]}, index=np.random.rand(4)
    )
    koalas_df.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
    compare(koalas_df)
def test_dot(self):
    """Compare ``Series.dot`` with pandas for Series operands.

    Also checks the error raised for operands of different length and the error
    raised when the other operand is a DataFrame.
    """
    pandas_left = pd.Series([90, 91, 85], index=[2, 4, 1])
    koalas_left = ks.from_pandas(pandas_left)

    # Other operand with the same index labels in the same order.
    pandas_right = pd.Series([90, 91, 85], index=[2, 4, 1])
    koalas_right = ks.from_pandas(pandas_right)
    self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

    # Other operand with the same labels in a different order.
    koalas_right = ks.Series([90, 91, 85], index=[1, 2, 4])
    pandas_right = pd.Series([90, 91, 85], index=[1, 2, 4])
    self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

    # Length of the index is different.
    koalas_right = ks.Series([90, 91, 85, 100], index=[2, 4, 1, 0])
    with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
        koalas_left.dot(koalas_right)

    # A DataFrame operand is not supported for now because of a performance issue;
    # a ValueError with a proper message is raised instead.
    koalas_df = ks.DataFrame([[0, 1], [-2, 3], [4, -5]], index=[2, 4, 1])
    with self.assertRaisesRegex(
        ValueError, r"Series\.dot\(\) is currently not supported*"
    ):
        koalas_left.dot(koalas_df)

    # MultiIndex on both operands.
    two_level = pd.MultiIndex(
        [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    pandas_left = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=two_level)
    koalas_left = ks.from_pandas(pandas_left)
    pandas_right = pd.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3], index=two_level)
    koalas_right = ks.from_pandas(pandas_right)
    self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))
def _test_groupby_rolling_func(self, f):
    """Compare a grouped ``rolling(2)`` aggregation between Koalas and pandas.

    ``f`` is the name of the method to call on the rolling object. Results are
    compared through their ``repr``.
    """

    def koalas_repr(grouped):
        return repr(getattr(grouped.rolling(2), f)().sort_index())

    def pandas_repr(grouped, sort=True):
        result = getattr(grouped.rolling(2), f)()
        return repr(result.sort_index() if sort else result)

    # Series with a random float index, grouped by its own values
    koalas_series = ks.Series([1, 2, 3], index=np.random.rand(3))
    pandas_series = koalas_series.to_pandas()
    self.assert_eq(
        koalas_repr(koalas_series.groupby(koalas_series)),
        pandas_repr(pandas_series.groupby(pandas_series)),
    )

    # Series with a MultiIndex; the pandas result is compared without sorting here.
    koalas_series = ks.Series(
        [1, 2, 3],
        index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]),
    )
    pandas_series = koalas_series.to_pandas()
    self.assert_eq(
        koalas_repr(koalas_series.groupby(koalas_series)),
        pandas_repr(pandas_series.groupby(pandas_series), sort=False),
    )

    # DataFrame grouped by column "a"
    koalas_df = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
    pandas_df = koalas_df.to_pandas()
    self.assert_eq(
        koalas_repr(koalas_df.groupby(koalas_df.a)),
        pandas_repr(pandas_df.groupby(pandas_df.a)),
    )

    # DataFrame with MultiIndex columns, grouped by one and then by two column labels
    koalas_df = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
    koalas_df.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
    pandas_df = koalas_df.to_pandas()
    self.assert_eq(
        koalas_repr(koalas_df.groupby(("a", "x"))),
        pandas_repr(pandas_df.groupby(("a", "x"))),
    )
    self.assert_eq(
        koalas_repr(koalas_df.groupby([("a", "x"), ("a", "y")])),
        pandas_repr(pandas_df.groupby([("a", "x"), ("a", "y")])),
    )
def test_broadcast(self):
    """``ks.broadcast`` returns an equal DataFrame and rejects a Series argument."""
    records = {
        "key": ["K0", "K1", "K2", "K3"],
        "A": ["A0", "A1", "A2", "A3"],
    }
    koalas_df = ks.DataFrame(records, columns=["key", "A"])
    self.assert_eq(koalas_df, ks.broadcast(koalas_df))

    koalas_series = ks.Series([1, 2, 3])
    # The expected message embeds the class name of the rejected argument.
    expected_pattern = "Invalid type : expected DataFrame got " + type(koalas_series).__name__
    with self.assertRaisesRegex(ValueError, expected_pattern):
        ks.broadcast(koalas_series)
def test_index(self):
    """Compare index ``name`` / ``names`` between a Koalas series and its pandas copy."""
    labels = pd.Index([1, 2, 3, 4, 5, 6, 7, 8, 9])
    koalas_series = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=labels)
    pandas_series = koalas_series.to_pandas()

    # NOTE(review): the assignments below target the Series (`name` / `names`), while
    # the assertions read the index attributes. It is not clear from here that the
    # assignments affect what is compared - confirm the intent.

    # Intended to check that the name of the Index is set properly.
    koalas_series.name = "koalas"
    pandas_series.name = "koalas"
    self.assert_eq(koalas_series.index.name, pandas_series.index.name)

    # Intended to check that the names of a MultiIndex are set properly.
    new_names = ["hello", "koalas"]
    koalas_series.names = list(new_names)
    pandas_series.names = list(new_names)
    self.assert_eq(koalas_series.index.names, pandas_series.index.names)
def test_truncate(self):
    """Compare ``Series.truncate`` with pandas and check its error cases."""
    values = [10, 20, 30, 40, 50, 60, 70]

    ascending_pandas = pd.Series(list(values), index=[1, 2, 3, 4, 5, 6, 7])
    ascending_koalas = ks.Series(ascending_pandas)
    descending_pandas = pd.Series(list(values), index=[7, 6, 5, 4, 3, 2, 1])
    descending_koalas = ks.Series(descending_pandas)

    # (koalas series, pandas series, positional args, keyword args)
    scenarios = [
        (ascending_koalas, ascending_pandas, (), {}),
        (ascending_koalas, ascending_pandas, (), {"before": 2}),
        (ascending_koalas, ascending_pandas, (), {"after": 5}),
        (ascending_koalas, ascending_pandas, (), {"copy": False}),
        (ascending_koalas, ascending_pandas, (2, 5), {"copy": False}),
        (descending_koalas, descending_pandas, (4, 6), {}),
        (descending_koalas, descending_pandas, (4, 6), {"copy": False}),
    ]
    for koalas_series, pandas_series, args, kwargs in scenarios:
        self.assert_eq(
            koalas_series.truncate(*args, **kwargs),
            pandas_series.truncate(*args, **kwargs),
        )

    # An index that is neither increasing nor decreasing is rejected.
    unsorted_koalas = ks.Series(list(values), index=[1, 2, 3, 4, 3, 2, 1])
    with self.assertRaisesRegex(ValueError, "truncate requires a sorted index"):
        unsorted_koalas.truncate()

    # `before` greater than `after` is rejected.
    sorted_koalas = ks.Series(list(values), index=[1, 2, 3, 4, 5, 6, 7])
    with self.assertRaisesRegex(ValueError, "Truncate: 2 must be after 5"):
        sorted_koalas.truncate(5, 2)
def test_getitem(self):
    """Compare ``Series.__getitem__`` with pandas for flat and MultiIndex series."""
    pandas_series = pd.Series([10, 20, 15, 30, 45], ["A", "A", "B", "C", "D"])
    koalas_series = ks.Series(pandas_series)

    # "A" is a repeated label, "B" a unique one.
    for label in ("A", "B"):
        self.assert_eq(koalas_series[label], pandas_series[label])

    # MultiIndex
    three_level = pd.MultiIndex(
        [["a", "b", "c"], ["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [
            [0, 0, 0, 0, 0, 0, 1, 1, 1],
            [0, 0, 0, 1, 1, 1, 2, 2, 2],
            [0, 0, 0, 0, 1, 2, 0, 1, 2],
        ],
    )
    pandas_series = pd.Series(
        [45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], name="0", index=three_level
    )
    koalas_series = ks.Series(pandas_series)

    self.assert_eq(koalas_series["a"], pandas_series["a"])
    self.assert_eq(koalas_series["a", "lama"], pandas_series["a", "lama"])

    # A key with more parts than the index has levels raises KeyError.
    too_long = r"'Key length \(4\) exceeds index depth \(3\)'"
    with self.assertRaisesRegex(KeyError, too_long):
        koalas_series[("a", "lama", "speed", "x")]
def test_iat(self):
    """Exercise the ``.iat`` accessor on Koalas DataFrames and Series against pandas."""
    pandas_df = self.pdf
    koalas_df = self.kdf

    # Hand-built Koalas counterpart of pdf.loc[3]; needed because .loc[n] does not
    # currently work with Koalas DataFrames (#383).
    row_three = ks.Series([3, 6], index=["a", "b"], name="3")

    # Malformed keys.
    # NOTE: the `msg` keyword of assertRaises is only the text reported when the
    # assertion itself fails; it is not compared with the raised exception's message.
    with self.assertRaises(
        TypeError,
        msg="Use DataFrame.at like .iat[row_interget_position, column_integer_position]",
    ):
        koalas_df.iat[3]
    with self.assertRaises(
        ValueError,
        msg="iAt based indexing on multi-index can only have tuple values",
    ):
        koalas_df.iat[3, "b"]  # a string is given where a column position is expected
    with self.assertRaises(TypeError, msg="Use Series.iat like .iat[row_integer_position]"):
        row_three.iat[3, "b"]

    # DataFrame lookups
    self.assertEqual(koalas_df.iat[7, 0], 8)
    self.assertEqual(koalas_df.iat[7, 0], pandas_df.iat[7, 0])

    # Series lookups
    self.assertEqual(row_three.iat[1], 6)
    self.assertEqual(row_three.iat[1], pandas_df.loc[3].iat[1])

    # Out-of-range column or row positions raise KeyError.
    with self.assertRaises(KeyError, msg=99):
        koalas_df.iat[0, 99]
    with self.assertRaises(KeyError, msg=99):
        koalas_df.iat[99, 0]

    # Tuple-valued positions on either axis raise ValueError.
    with self.assertRaises(ValueError):
        koalas_df.iat[(1, 1), 1]
    with self.assertRaises(ValueError):
        koalas_df.iat[1, (1, 1)]

    # Assignment through .iat is expected to fail.
    with self.assertRaises(TypeError):
        koalas_df.iat[4, 1] = 10
def test_replace(self):
    """Compare no-op ``Series.replace`` calls with pandas and check the error cases."""
    pandas_series = pd.Series([10, 20, 15, 30, 45], name="x")
    koalas_series = ks.Series(pandas_series)

    # No arguments, then an empty mapping.
    self.assert_eq(koalas_series.replace(), pandas_series.replace())
    self.assert_eq(koalas_series.replace({}), pandas_series.replace({}))

    # A Koalas object is not an accepted `to_replace` value.
    unsupported_type = "'to_replace' should be one of str, list, dict, int, float"
    with self.assertRaisesRegex(ValueError, unsupported_type):
        koalas_series.replace(ks.range(5))

    # The two lists must have the same length.
    length_mismatch = "Replacement lists must match in length. Expecting 3 got 2"
    with self.assertRaisesRegex(ValueError, length_mismatch):
        koalas_series.replace([10, 20, 30], [1, 2])

    # Regex replacement is not implemented.
    no_regex = "replace currently not support for regex"
    with self.assertRaisesRegex(NotImplementedError, no_regex):
        koalas_series.replace(r'^1.$', regex=True)
def test_pop(self):
    """Compare ``Series.pop`` with pandas on a MultiIndex series and check invalid keys."""
    two_level = pd.MultiIndex(
        [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    koalas_series = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=two_level)
    pandas_series = koalas_series.to_pandas()

    self.assert_eq(
        koalas_series.pop(("lama", "speed")),
        pandas_series.pop(("lama", "speed")),
    )

    # (key, expected exception type, regex the exception must match)
    invalid_keys = (
        (
            0,
            ValueError,
            "'key' should be string or tuple that contains strings",
        ),
        (
            ("lama", 0),
            ValueError,
            "'key' should have index names as only strings "
            "or a tuple that contain index names as only strings",
        ),
        (
            ("lama", "speed", "x"),
            KeyError,
            r"'Key length \(3\) exceeds index depth \(2\)'",
        ),
    )
    for key, error_type, pattern in invalid_keys:
        with self.assertRaisesRegex(error_type, pattern):
            koalas_series.pop(key)
def test_get_dummies(self):
    """Compare ``ks.get_dummies`` with ``pd.get_dummies(..., dtype=np.int8)``.

    Also checks that ``sparse=True`` raises ``NotImplementedError``.
    """
    series_values = [1, 1, 1, 2, 2, 1, 3, 4]

    pandas_inputs = (
        pd.Series(list(series_values)),
        # Categorical inputs are left disabled:
        # pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category'),
        # pd.Series(pd.Categorical([1, 1, 1, 2, 2, 1, 3, 4],
        #                          categories=[4, 3, 2, 1])),
        pd.DataFrame(
            {
                "a": [1, 2, 3, 4, 4, 3, 2, 1],
                # 'b': pd.Categorical(list('abcdabcd')),
                "b": list("abcdabcd"),
            }
        ),
    )
    for pandas_obj in pandas_inputs:
        koalas_obj = ks.from_pandas(pandas_obj)
        self.assert_eq(
            ks.get_dummies(koalas_obj),
            pd.get_dummies(pandas_obj, dtype=np.int8),
        )

    koalas_series = ks.Series(list(series_values))
    with self.assertRaisesRegex(
        NotImplementedError, "get_dummies currently does not support sparse"
    ):
        ks.get_dummies(koalas_series, sparse=True)
def test_clip(self):
    """Compare ``Series.clip`` with pandas and check the rejected argument types."""
    pandas_series = pd.Series([0, 2, 4])
    koalas_series = ks.from_pandas(pandas_series)

    # List-like bounds are rejected for both 'lower' and 'upper'.
    # NOTE: the `msg` keyword of assertRaises is only the text reported when the
    # assertion itself fails; it is not compared with the raised exception's message.
    hint = "List-like value are not supported for 'lower' and 'upper' at the moment"
    for bound in ("lower", "upper"):
        with self.assertRaises(ValueError, msg=hint):
            koalas_series.clip(**{bound: [1]})

    # No bounds at all
    self.assert_eq(koalas_series.clip(), pandas_series.clip())
    # Lower bound only
    self.assert_eq(koalas_series.clip(1), pandas_series.clip(1))
    # Upper bound only
    self.assert_eq(koalas_series.clip(upper=3), pandas_series.clip(upper=3))
    # Both bounds
    self.assert_eq(koalas_series.clip(1, 3), pandas_series.clip(1, 3))

    # A string series is expected to come back equal to itself.
    text_series = ks.Series(["a", "b", "c"])
    self.assert_eq(text_series.clip(1, 3), text_series)
def test_dot(self):
    """Compare ``Series.dot`` with pandas for Series and DataFrame operands."""

    def compare_with_frame(koalas_left, pandas_left, pandas_frame):
        # Convert the pandas frame to Koalas, then compare both dot products.
        koalas_frame = ks.from_pandas(pandas_frame)
        self.assert_eq(koalas_left.dot(koalas_frame), pandas_left.dot(pandas_frame))

    pandas_left = pd.Series([90, 91, 85], index=[2, 4, 1])
    koalas_left = ks.from_pandas(pandas_left)

    # Other operand with the same index labels in the same order.
    pandas_right = pd.Series([90, 91, 85], index=[2, 4, 1])
    koalas_right = ks.from_pandas(pandas_right)
    self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

    # Other operand with the same labels in a different order.
    koalas_right = ks.Series([90, 91, 85], index=[1, 2, 4])
    pandas_right = pd.Series([90, 91, 85], index=[1, 2, 4])
    self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

    # Length of the index is different.
    koalas_right = ks.Series([90, 91, 85, 100], index=[2, 4, 1, 0])
    with self.assertRaisesRegex(ValueError, "matrices are not aligned"):
        koalas_left.dot(koalas_right)

    # MultiIndex on both operands.
    two_level = pd.MultiIndex(
        [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    pandas_left = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=two_level)
    koalas_left = ks.from_pandas(pandas_left)
    pandas_right = pd.Series([-450, 20, 12, -30, -250, 15, -320, 100, 3], index=two_level)
    koalas_right = ks.from_pandas(pandas_right)
    self.assert_eq(koalas_left.dot(koalas_right), pandas_left.dot(pandas_right))

    pandas_left = pd.Series([0, 1, 2, 3])
    koalas_left = ks.from_pandas(pandas_left)

    # DataFrame "other" without Index/MultiIndex as columns
    pandas_frame = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
    compare_with_frame(koalas_left, pandas_left, pandas_frame)

    # DataFrame "other" with Index as columns
    pandas_frame.columns = pd.Index(["x", "y"])
    compare_with_frame(koalas_left, pandas_left, pandas_frame)
    pandas_frame.columns = pd.Index(["x", "y"], name="cols_name")
    compare_with_frame(koalas_left, pandas_left, pandas_frame)

    # Same frame with its rows reordered.
    pandas_frame = pandas_frame.reindex([1, 0, 2, 3])
    compare_with_frame(koalas_left, pandas_left, pandas_frame)

    # DataFrame "other" with MultiIndex as columns
    pandas_frame.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
    compare_with_frame(koalas_left, pandas_left, pandas_frame)
    pandas_frame.columns = pd.MultiIndex.from_tuples(
        [("a", "x"), ("b", "y")], names=["cols_name1", "cols_name2"]
    )
    compare_with_frame(koalas_left, pandas_left, pandas_frame)

    # Left operand taken from one Koalas DataFrame, right operand a separate one.
    koalas_left = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).b
    pandas_left = koalas_left.to_pandas()
    koalas_frame = ks.DataFrame({"c": [7, 8, 9]})
    pandas_frame = koalas_frame.to_pandas()
    self.assert_eq(koalas_left.dot(koalas_frame), pandas_left.dot(pandas_frame))
def test_astype(self):
    """``Series.astype`` with an unknown type name must raise ``ValueError``."""
    koalas_series = ks.Series(pd.Series([10, 20, 15, 30, 45], name="x"))

    unknown_type = "int63"
    with self.assertRaisesRegex(ValueError, "Type int63 not understood"):
        koalas_series.astype(unknown_type)
def test_median(self):
    """``Series.median`` with a non-integer ``accuracy`` must raise ``ValueError``."""
    expected_pattern = "accuracy must be an integer; however"
    with self.assertRaisesRegex(ValueError, expected_pattern):
        ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a")
def test_nlargest(self):
    """Compare ``Series.nlargest`` with pandas on data that contains a NaN."""
    values = [1, 2, 3, 4, np.nan, 6]
    pandas_series = pd.Series(values, name="x")
    koalas_series = koalas.Series(values, name="x")

    # Explicit n first, then the default n.
    self.assert_eq(koalas_series.nlargest(n=3), pandas_series.nlargest(n=3))
    self.assert_eq(koalas_series.nlargest(), pandas_series.nlargest())
def test_groupby_expanding_count(self):
    """Check grouped ``expanding(2).count()`` for Series and DataFrames.

    With pandas >= 1.0.0 the generic pandas comparison helper is used. With older
    pandas the results are compared with hard-coded expectations instead.
    """
    # The behaviour of ExpandingGroupby.count differs between pandas>=1.0.0 and lower,
    # and we follow the behaviour of the latest pandas.
    if LooseVersion(pd.__version__) >= LooseVersion("1.0.0"):
        self._test_groupby_expanding_func("count")
        return

    # Series with a random float index, grouped by its own values
    koalas_series = ks.Series([1, 2, 3], index=np.random.rand(3))
    group_values = koalas_series.to_pandas().values
    index_values = koalas_series.index.to_pandas().values
    expected_index = pd.MultiIndex.from_tuples(list(zip(group_values, index_values)))
    expected = ks.Series([np.nan, np.nan, np.nan], index=expected_index)
    self.assert_eq(
        koalas_series.groupby(koalas_series).expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )

    # Series with a MultiIndex
    koalas_series = ks.Series(
        [1, 2, 3],
        index=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y"), ("b", "z")]),
    )
    expected_index = pd.MultiIndex.from_tuples(
        [(1, "a", "x"), (2, "a", "y"), (3, "b", "z")]
    )
    expected = ks.Series([np.nan, np.nan, np.nan], index=expected_index)
    self.assert_eq(
        koalas_series.groupby(koalas_series).expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )

    # DataFrame grouped by column "a"
    koalas_df = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
    expected_index = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
    expected = ks.DataFrame(
        {"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]},
        index=expected_index,
    )
    self.assert_eq(
        koalas_df.groupby(koalas_df.a).expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )

    # DataFrame with MultiIndex columns, grouped by a single column label
    koalas_df = ks.DataFrame({"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0]})
    koalas_df.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
    expected_index = pd.MultiIndex.from_tuples([(1, 0), (2, 1), (2, 3), (3, 2)])
    expected = ks.DataFrame(
        {"a": [None, None, 2.0, None], "b": [None, None, 2.0, None]},
        index=expected_index,
    )
    expected.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
    self.assert_eq(
        koalas_df.groupby(("a", "x")).expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )

    # Same frame grouped by both column labels
    expected_index = pd.MultiIndex.from_tuples(
        [(1, 4.0, 0), (2, 1.0, 3), (2, 2.0, 1), (3, 3.0, 2)]
    )
    expected = ks.DataFrame(
        {
            "a": [np.nan, np.nan, np.nan, np.nan],
            "b": [np.nan, np.nan, np.nan, np.nan],
        },
        index=expected_index,
    )
    expected.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")])
    self.assert_eq(
        koalas_df.groupby([("a", "x"), ("a", "y")]).expanding(2).count().sort_index(),
        expected.sort_index(),
        almost=True,
    )
df["prediction"] = model.predict(df)
stop = datetime.now()
# Report the whole elapsed interval: timedelta.seconds only holds the seconds
# component of the difference and drops whole days, total_seconds() does not.
print("Temps préparation et inférence (ML) : ", int((stop - start).total_seconds()), "s")

# %%
##### 7th change: we therefore have to recompute the score ourselves
from databricks.koalas.config import set_option, reset_option

# Turn the "compute.ops_on_diff_frames" option on before `df` is combined with the
# separately built `reel` frame below.
set_option("compute.ops_on_diff_frames", True)

# Score : The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of
# squares ((y_true - y_pred) ** 2).sum() and v is the total sum of squares
# ((y_true - y_true.mean()) ** 2).sum()
reel = ks.Series(y_test).to_frame().rename(columns={0: 'Reel'})
result = ks.concat([df, reel], axis=1)
result['square_diff_true_pred'] = (result['Reel'] - result['prediction'])**2
u = result['square_diff_true_pred'].sum()  # residual sum of squares
v = ((result['Reel'] - result['Reel'].mean())**2).sum()  # total sum of squares
score = (1 - u / v)
print(f"score: {score}")

# %% [markdown]
# ## Training and inference with Pipeline

# %% [markdown]
# Only trained models and predictions can be used with koalas

# %%
def test_unsupported_type(self):
    """Accessing ``.dt`` on an integer series must raise ``ValueError``."""
    integer_series_factory = ks.Series
    with self.assertRaisesRegex(ValueError, "Cannot call DatetimeMethods on type LongType"):
        # Building the series and reading `.dt` both happen inside the checked region.
        integer_series_factory([0]).dt