def test_to_series_comparison(self):
    """Series built from two equal indexes compare equal, with and without a name."""
    left = ks.Index([1, 2, 3, 4, 5])
    right = ks.Index([1, 2, 3, 4, 5])

    # Unnamed indexes.
    all_equal = (left.to_series() == right.to_series()).all()
    self.assert_eq(all_equal, True)

    # Same comparison once both indexes carry the same name.
    left.name = "koalas"
    right.name = "koalas"
    all_equal = (left.to_series() == right.to_series()).all()
    self.assert_eq(all_equal, True)
def test_sort_values(self):
    """sort_values on Index and MultiIndex agrees with pandas in both directions."""

    def check_both_orders(pandas_index, koalas_index):
        # Default (ascending) order first, then explicit descending order.
        self.assert_eq(pandas_index.sort_values(), koalas_index.sort_values())
        self.assert_eq(
            pandas_index.sort_values(ascending=False),
            koalas_index.sort_values(ascending=False),
        )

    pidx = pd.Index([-10, -100, 200, 100])
    kidx = ks.Index([-10, -100, 200, 100])
    check_both_orders(pidx, kidx)

    # Repeat with a named index.
    pidx.name = "koalas"
    kidx.name = "koalas"
    check_both_orders(pidx, kidx)

    # MultiIndex with level names.
    pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    pidx.names = ["hello", "koalas", "goodbye"]
    kidx.names = ["hello", "koalas", "goodbye"]
    check_both_orders(pidx, kidx)
def test_sort_values(self):
    """Compare sort_values against pandas for plain, named and multi-level indexes."""
    # Keyword sets passed to sort_values: the default call, then descending.
    sort_kwargs = ({}, {'ascending': False})

    pidx = pd.Index([-10, -100, 200, 100])
    kidx = ks.Index([-10, -100, 200, 100])
    for kwargs in sort_kwargs:
        self.assert_eq(pidx.sort_values(**kwargs), kidx.sort_values(**kwargs))

    # Naming the index must not change the outcome.
    pidx.name = 'koalas'
    kidx.name = 'koalas'
    for kwargs in sort_kwargs:
        self.assert_eq(pidx.sort_values(**kwargs), kidx.sort_values(**kwargs))

    # MultiIndex with named levels.
    pidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
    kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
    pidx.names = ['hello', 'koalas', 'goodbye']
    kidx.names = ['hello', 'koalas', 'goodbye']
    for kwargs in sort_kwargs:
        self.assert_eq(pidx.sort_values(**kwargs), kidx.sort_values(**kwargs))
def test_series_iloc_setitem(self):
    """Assignments through Series.iloc match pandas for list, scalar and slice keys."""
    pser = pd.Series([1, 2, 3], index=["cobra", "viper", "sidewinder"])
    kser = ks.from_pandas(pser)

    # Indexer objects obtained up front; later writes through them are
    # compared against the series they were taken from.
    piloc = pser.iloc
    kiloc = kser.iloc

    # Derived series, mutated independently of the originals.
    pser1 = pser + 1
    kser1 = kser + 1

    cases = [
        ([1, 2], 10),
        (1, 50),
        (slice(None), 10),
        (slice(None, 1), 20),
        (slice(1, None), 30),
    ]
    for key, value in cases:
        with self.subTest(key=key, value=value):
            # Write through a freshly fetched .iloc ...
            pser.iloc[key] = value
            kser.iloc[key] = value
            self.assert_eq(kser, pser)

            # ... then through the indexers captured earlier ...
            piloc[key] = -value
            kiloc[key] = -value
            self.assert_eq(kser, pser)

            # ... and finally on the derived series.
            pser1.iloc[key] = value
            kser1.iloc[key] = value
            self.assert_eq(kser1, pser1)

    # A whole series cannot be assigned to a single position.
    with self.assertRaises(ValueError):
        kser.iloc[1] = -kser

    # Series created from an Index.
    pser = pd.Index([1, 2, 3]).to_series()
    kser = ks.Index([1, 2, 3]).to_series()
    pser1 = pser + 1
    kser1 = kser + 1

    pser.iloc[0] = 10
    kser.iloc[0] = 10
    self.assert_eq(kser, pser)

    pser1.iloc[0] = 20
    kser1.iloc[0] = 20
    self.assert_eq(kser1, pser1)

    # Series taken from a DataFrame column, assigned from a sibling column.
    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    kdf = ks.from_pandas(pdf)
    pser = pdf.a
    kser = kdf.a

    pser.iloc[[1]] = -pdf.b
    kser.iloc[[1]] = -kdf.b
    self.assert_eq(kser, pser)

    # A DataFrame is rejected as the assigned value.
    with self.assertRaisesRegex(ValueError, "Incompatible indexer with DataFrame"):
        kser.iloc[1] = kdf[["b"]]
def test_index_drop_duplicates(self):
    """drop_duplicates matches pandas for a flat Index and a MultiIndex."""
    # Flat index containing one repeated value.
    pandas_index = pd.Index([1, 1, 2])
    koalas_index = ks.Index([1, 1, 2])
    self.assert_eq(pandas_index.drop_duplicates(), koalas_index.drop_duplicates())

    # MultiIndex containing one repeated tuple.
    pandas_index = pd.MultiIndex.from_tuples(
        [(1, 1), (1, 1), (2, 2)], names=['level1', 'level2'])
    koalas_index = ks.MultiIndex.from_tuples(
        [(1, 1), (1, 1), (2, 2)], names=['level1', 'level2'])
    self.assert_eq(pandas_index.drop_duplicates(), koalas_index.drop_duplicates())
def test_len(self):
    """len() of a Koalas Index / MultiIndex equals the pandas length."""
    # Flat index over 10000 consecutive integers.
    pandas_index = pd.Index(range(10000))
    koalas_index = ks.Index(range(10000))
    expected, actual = len(pandas_index), len(koalas_index)
    self.assert_eq(expected, actual)

    # Three-level MultiIndex with three entries.
    pandas_index = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
    koalas_index = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
    expected, actual = len(pandas_index), len(koalas_index)
    self.assert_eq(expected, actual)
def test_len(self):
    """Check that len() agrees between pandas and Koalas indexes."""
    # (pandas index, Koalas index) pairs; each pair is built and checked in
    # turn, with the pandas object constructed first.
    pidx = pd.Index(range(10000))
    kidx = ks.Index(range(10000))
    pandas_length = len(pidx)
    self.assert_eq(pandas_length, len(kidx))

    pidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    kidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    pandas_length = len(pidx)
    self.assert_eq(pandas_length, len(kidx))
def test_multi_index_symmetric_difference(self):
    """MultiIndex.symmetric_difference matches pandas and rejects a plain Index."""
    idx = ks.Index(['a', 'b', 'c'])
    midx = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
    midx_ = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])

    # Two identical MultiIndexes: the result must equal what pandas returns.
    self.assert_eq(
        midx.symmetric_difference(midx_),
        midx.to_pandas().symmetric_difference(midx_.to_pandas()))

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which was removed from unittest in Python 3.12.
    # NOTE(review): in the pattern below the trailing `*` quantifies the
    # final "t"; it is kept unchanged so the matched messages stay the same.
    with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
        midx.symmetric_difference(idx)
def indexer_between_time(
    self,
    start_time: Union[datetime.time, str],
    end_time: Union[datetime.time, str],
    include_start: bool = True,
    include_end: bool = True,
) -> Index:
    """
    Return index locations of values between particular times of day
    (e.g., 9:00-9:30AM).

    Parameters
    ----------
    start_time, end_time : datetime.time, str
        Time passed either as object (datetime.time) or as string in
        appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
        "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p").
    include_start : bool, default True
        Whether values exactly at `start_time` are part of the result.
    include_end : bool, default True
        Whether values exactly at `end_time` are part of the result.

    Returns
    -------
    values_between_time : Index of integers

    Examples
    --------
    >>> kidx = ks.date_range("2000-01-01", periods=3, freq="T")
    >>> kidx  # doctest: +NORMALIZE_WHITESPACE
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00'],
                  dtype='datetime64[ns]', freq=None)

    >>> kidx.indexer_between_time("00:01", "00:02").sort_values()
    Int64Index([1, 2], dtype='int64')

    >>> kidx.indexer_between_time("00:01", "00:02", include_end=False)
    Int64Index([1], dtype='int64')

    >>> kidx.indexer_between_time("00:01", "00:02", include_start=False)
    Int64Index([2], dtype='int64')
    """

    def pandas_between_time(pdf) -> ks.DataFrame[int]:
        # Per-batch filter: delegates to pandas' own between_time.
        return pdf.between_time(start_time, end_time, include_start, include_end)

    # Frame that keeps this index and selects no columns.
    index_only = self.to_frame()[[]]

    # Attach a "distributed-sequence" id column under a temporary name.
    id_col = verify_temp_column_name(index_only, "__id_column__")
    with_ids = index_only.koalas.attach_id_column("distributed-sequence", id_col)

    with ks.option_context("compute.default_index_type", "distributed"):
        # The index attached by the statement below is dropped right away,
        # so the "distributed" default index type is enforced for it.
        filtered = with_ids.koalas.apply_batch(pandas_between_time)

    # Build the result from the first column of the filtered frame, renamed
    # after this index.
    return ks.Index(first_series(filtered).rename(self.name))
def test_argmax(self):
    """Index.argmax matches pandas; MultiIndex.argmax raises TypeError."""
    pandas_index = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0])
    koalas_index = ks.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0])

    expected = pandas_index.argmax()
    self.assert_eq(expected, koalas_index.argmax())

    # MultiIndex
    multi = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    message = "reduction operation 'argmax' not allowed for this dtype"
    with self.assertRaisesRegex(TypeError, message):
        multi.argmax()
def test_index_sort(self):
    """In-place sort() is rejected with a TypeError on Index and MultiIndex."""
    flat = ks.Index([1, 2, 3, 4, 5])
    multi = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2)])

    message = "cannot sort an Index object in-place, use sort_values instead"
    # Same expectation for both index kinds, flat index first.
    for index in (flat, multi):
        with self.assertRaisesRegex(TypeError, message):
            index.sort()
def test_multi_index_symmetric_difference(self):
    """MultiIndex.symmetric_difference matches pandas and rejects a plain Index."""
    idx = ks.Index(["a", "b", "c"])
    midx = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    midx_ = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])

    # Two identical MultiIndexes: the result must equal what pandas returns.
    self.assert_eq(
        midx.symmetric_difference(midx_),
        midx.to_pandas().symmetric_difference(midx_.to_pandas()),
    )

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which was removed from unittest in Python 3.12.
    # NOTE(review): in the pattern below the trailing `*` quantifies the
    # final "t"; it is kept unchanged so the matched messages stay the same.
    with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
        midx.symmetric_difference(idx)
def test_argmin(self):
    """Index.argmin matches pandas; MultiIndex.argmin raises TypeError."""
    pandas_index = pd.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0])
    koalas_index = ks.Index([100, 50, 10, 20, 30, 60, 0, 50, 0, 100, 100, 100, 20, 0, 0])

    expected = pandas_index.argmin()
    self.assert_eq(expected, koalas_index.argmin())

    # MultiIndex
    multi = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
    message = "reduction operation 'argmin' not allowed for this dtype"
    with self.assertRaisesRegex(TypeError, message):
        multi.argmin()
def test_categorical_index(self):
    """Categorical indexes expose the same values, categories, codes and ordering as pandas."""

    def check_categorical(koalas_index, pandas_index):
        # The four comparisons applied to every index pair in this test.
        self.assert_eq(koalas_index, pandas_index)
        self.assert_eq(koalas_index.categories, pandas_index.categories)
        self.assert_eq(koalas_index.codes, pd.Index(pandas_index.codes))
        self.assert_eq(koalas_index.ordered, pandas_index.ordered)

    # Built with the dedicated CategoricalIndex constructor.
    pidx = pd.CategoricalIndex([1, 2, 3])
    kidx = ks.CategoricalIndex([1, 2, 3])
    check_categorical(kidx, pidx)

    # Built through Index(..., dtype="category").
    pidx = pd.Index([1, 2, 3], dtype="category")
    kidx = ks.Index([1, 2, 3], dtype="category")
    check_categorical(kidx, pidx)

    # Frame with categorical columns and an ordered categorical index.
    pdf = pd.DataFrame(
        {
            "a": pd.Categorical([1, 2, 3, 1, 2, 3]),
            "b": pd.Categorical(["a", "b", "c", "a", "b", "c"], categories=["c", "b", "a"]),
        },
        index=pd.Categorical([10, 20, 30, 20, 30, 10], categories=[30, 10, 20], ordered=True),
    )
    kdf = ks.from_pandas(pdf)

    # Index obtained by promoting a categorical column.
    pidx = pdf.set_index("b").index
    kidx = kdf.set_index("b").index
    check_categorical(kidx, pidx)

    # First level of a MultiIndex made from both categorical columns.
    pidx = pdf.set_index(["a", "b"]).index.get_level_values(0)
    kidx = kdf.set_index(["a", "b"]).index.get_level_values(0)
    check_categorical(kidx, pidx)
def test_arithmetic_op_exceptions(self):
    """Unsupported arithmetic on a datetime series raises with the expected messages."""
    kser = self.ks_start_date
    py_datetime = self.pd_start_date.dt.to_pydatetime()
    datetime_index = ks.Index(self.pd_start_date)

    # (expected message, binary operation) pairs. Each operation is tried
    # with the datetime series as the left operand and then as the right one.
    rejected_operations = [
        ("addition can not be applied to date times.", lambda a, b: a + b),
        ("multiplication can not be applied to date times.", lambda a, b: a * b),
        ("division can not be applied to date times.", lambda a, b: a / b),
        ("division can not be applied to date times.", lambda a, b: a // b),
        ("modulo can not be applied to date times.", lambda a, b: a % b),
    ]
    for other in [1, 0.1, kser, datetime_index, py_datetime]:
        for message, apply_op in rejected_operations:
            self.assertRaisesRegex(TypeError, message, lambda: apply_op(kser, other))
            self.assertRaisesRegex(TypeError, message, lambda: apply_op(other, kser))

    expected_err_msg = "datetime subtraction can only be applied to datetime series."

    for other in [1, 0.1]:
        self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser - other)
        self.assertRaisesRegex(TypeError, expected_err_msg, lambda: other - kser)

    # `other` still holds the last value of the loop above at this point.
    self.assertRaisesRegex(TypeError, expected_err_msg, lambda: kser - other)
    self.assertRaises(NotImplementedError, lambda: py_datetime - kser)
def indexer_at_time(self, time: Union[datetime.time, str], asof: bool = False) -> Index:
    """
    Return index locations of values at particular time of day
    (e.g. 9:30AM).

    Parameters
    ----------
    time : datetime.time or str
        Time passed in either as object (datetime.time) or as string in
        appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
        "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").
    asof : bool, default False
        Not supported; any truthy value raises NotImplementedError.

    Returns
    -------
    values_at_time : Index of integers

    Raises
    ------
    NotImplementedError
        If `asof` is truthy.

    Examples
    --------
    >>> kidx = ks.date_range("2000-01-01", periods=3, freq="T")
    >>> kidx  # doctest: +NORMALIZE_WHITESPACE
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00'],
                  dtype='datetime64[ns]', freq=None)

    >>> kidx.indexer_at_time("00:00")
    Int64Index([0], dtype='int64')

    >>> kidx.indexer_at_time("00:01")
    Int64Index([1], dtype='int64')
    """
    if asof:
        raise NotImplementedError("'asof' argument is not supported")

    def pandas_at_time(pdf) -> ks.DataFrame[int]:
        # Per-batch filter: delegates to pandas' own at_time.
        return pdf.at_time(time, asof)

    # Frame that keeps this index and selects no columns.
    index_only = self.to_frame()[[]]

    # Attach a "distributed-sequence" id column under a temporary name.
    id_col = verify_temp_column_name(index_only, "__id_column__")
    with_ids = index_only.koalas.attach_id_column("distributed-sequence", id_col)

    with ks.option_context("compute.default_index_type", "distributed"):
        # The index attached by the statement below is dropped right away,
        # so the "distributed" default index type is enforced for it.
        filtered = with_ids.koalas.apply_batch(pandas_at_time)

    # Build the result from the first column of the filtered frame, renamed
    # after this index.
    return ks.Index(first_series(filtered).rename(self.name))
def test_series_iloc_setitem(self):
    """Assignments through Series.iloc match pandas for list, scalar and slice keys."""
    pser = pd.Series([1, 2, 3], index=["cobra", "viper", "sidewinder"])
    kser = ks.from_pandas(pser)

    # Derived series, mutated independently of the originals.
    pser1 = pser + 1
    kser1 = kser + 1

    cases = [
        ([1, 2], 10),
        (1, 50),
        (slice(None), 10),
        (slice(None, 1), 20),
        (slice(1, None), 30),
    ]
    for key, value in cases:
        with self.subTest(key=key, value=value):
            # Original series first ...
            pser.iloc[key] = value
            kser.iloc[key] = value
            self.assert_eq(kser, pser)

            # ... then the derived one.
            pser1.iloc[key] = value
            kser1.iloc[key] = value
            self.assert_eq(kser1, pser1)

    # A whole series cannot be assigned to a single position.
    with self.assertRaises(ValueError):
        kser.iloc[1] = -kser

    # Series created from an Index.
    pser = pd.Index([1, 2, 3]).to_series()
    kser = ks.Index([1, 2, 3]).to_series()
    pser1 = pser + 1
    kser1 = kser + 1

    pser.iloc[0] = 10
    kser.iloc[0] = 10
    self.assert_eq(kser, pser)

    pser1.iloc[0] = 20
    kser1.iloc[0] = 20
    self.assert_eq(kser1, pser1)
def test_index_symmetric_difference(self):
    """Index / MultiIndex symmetric_difference matches pandas; mixing the two kinds is rejected."""
    pidx1 = pd.Index([1, 2, 3, 4])
    pidx2 = pd.Index([2, 3, 4, 5])
    kidx1 = ks.from_pandas(pidx1)
    kidx2 = ks.from_pandas(pidx2)

    # Results are sorted on both sides before comparison.
    self.assert_eq(
        kidx1.symmetric_difference(kidx2).sort_values(),
        pidx1.symmetric_difference(pidx2).sort_values(),
    )
    # Same check with a derived (shifted) left-hand index.
    self.assert_eq(
        (kidx1 + 1).symmetric_difference(kidx2).sort_values(),
        (pidx1 + 1).symmetric_difference(pidx2).sort_values(),
    )

    # MultiIndexes built from explicit levels and codes; they differ only in
    # the first label of the first level.
    pmidx1 = pd.MultiIndex(
        [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
    )
    pmidx2 = pd.MultiIndex(
        [["koalas", "cow", "falcon"], ["speed", "weight", "length"]],
        [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 0, 0, 1, 2, 0, 1, 2]],
    )
    kmidx1 = ks.from_pandas(pmidx1)
    kmidx2 = ks.from_pandas(pmidx2)

    self.assert_eq(
        kmidx1.symmetric_difference(kmidx2).sort_values(),
        pmidx1.symmetric_difference(pmidx2).sort_values(),
    )

    idx = ks.Index(["a", "b", "c"])
    midx = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which was removed from unittest in Python 3.12.
    # NOTE(review): in the pattern below the trailing `*` quantifies the
    # final "t"; it is kept unchanged so the matched messages stay the same.
    with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
        idx.symmetric_difference(midx)
def test_index_nunique(self):
    """Index.nunique matches pandas on data that contains a missing value."""
    values = [1, 1, 2, None]
    pandas_index = pd.Index(values)
    koalas_index = ks.Index(values)

    # Default call, then the same call with dropna spelled out.
    expected = pandas_index.nunique()
    self.assert_eq(expected, koalas_index.nunique())

    expected = pandas_index.nunique(dropna=True)
    self.assert_eq(expected, koalas_index.nunique(dropna=True))
def test_index_symmetric_difference(self):
    """Index.symmetric_difference with a MultiIndex argument raises NotImplementedError."""
    idx = ks.Index(["a", "b", "c"])
    midx = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which was removed from unittest in Python 3.12.
    # NOTE(review): in the pattern below the trailing `*` quantifies the
    # final "t"; it is kept unchanged so the matched messages stay the same.
    with self.assertRaisesRegex(NotImplementedError, "Doesn't support*"):
        idx.symmetric_difference(midx)
def test_difference(self):
    """Index.difference / MultiIndex.difference match pandas and validate their input."""
    # Index
    kidx1 = ks.Index([1, 2, 3, 4], name="koalas")
    kidx2 = ks.Index([3, 4, 5, 6], name="koalas")
    pidx1 = kidx1.to_pandas()
    pidx2 = kidx2.to_pandas()

    self.assert_eq(
        kidx1.difference(kidx2).sort_values(), pidx1.difference(pidx2).sort_values())

    # The same right-hand values supplied as list, tuple, set and dict.
    array_likes = [
        [3, 4, 5, 6],
        (3, 4, 5, 6),
        {3, 4, 5, 6},
        {3: 1, 4: 2, 5: 3, 6: 4},
    ]
    for other in array_likes:
        self.assert_eq(
            kidx1.difference(other).sort_values(),
            pidx1.difference(other).sort_values(),
        )

    # Exceptions for Index
    for invalid in ["1234", 1234, 12.34, None, np.nan]:
        with self.assertRaisesRegex(TypeError, "Input must be Index or array-like"):
            kidx1.difference(invalid)

    sort_message = "The 'sort' keyword only takes the values of None or True; 1 was passed."
    with self.assertRaisesRegex(ValueError, sort_message):
        kidx1.difference(kidx2, sort=1)

    # MultiIndex
    level_names = ["hello", "koalas", "world"]
    kidx1 = ks.MultiIndex.from_tuples(
        [("a", "x", 1), ("b", "y", 2), ("c", "z", 3)], names=level_names)
    kidx2 = ks.MultiIndex.from_tuples(
        [("a", "x", 1), ("b", "z", 2), ("k", "z", 3)], names=level_names)
    pidx1 = kidx1.to_pandas()
    pidx2 = kidx2.to_pandas()

    self.assert_eq(
        kidx1.difference(kidx2).sort_values(), pidx1.difference(pidx2).sort_values())

    # A set of tuples, then a dict keyed by a tuple.
    tuple_collections = [
        {("a", "x", 1)},
        {("a", "x", 1): [1, 2, 3]},
    ]
    for other in tuple_collections:
        self.assert_eq(
            kidx1.difference(other).sort_values(),
            pidx1.difference(other).sort_values(),
        )

    # Exceptions for MultiIndex
    with self.assertRaisesRegex(
            TypeError, "other must be a MultiIndex or a list of tuples"):
        kidx1.difference(["b", "z", "2"])
def test_append(self):
    """Index.append / MultiIndex.append match pandas; mixing the two kinds is rejected.

    Covers unnamed and named indexes, indexes taken from DataFrames (with
    flat and MultiIndex columns), and MultiIndexes with and without level
    names, in both append directions.
    """
    # Index
    pidx = pd.Index(range(10000))
    kidx = ks.Index(range(10000))

    self.assert_eq(pidx.append(pidx), kidx.append(kidx))

    # Index with name
    pidx1 = pd.Index(range(10000), name="a")
    pidx2 = pd.Index(range(10000), name="b")
    kidx1 = ks.Index(range(10000), name="a")
    kidx2 = ks.Index(range(10000), name="b")

    self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2))
    self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1))

    # Index from DataFrame
    pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"])
    pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]}, index=["x", "y", "z"])
    kdf1 = ks.from_pandas(pdf1)
    kdf2 = ks.from_pandas(pdf2)

    pidx1 = pdf1.set_index("a").index
    pidx2 = pdf2.set_index("d").index
    kidx1 = kdf1.set_index("a").index
    kidx2 = kdf2.set_index("d").index

    self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2))
    self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1))

    # Index from DataFrame with MultiIndex columns
    pdf1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    pdf2 = pd.DataFrame({"a": [7, 8, 9], "d": [10, 11, 12]})
    pdf1.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
    pdf2.columns = pd.MultiIndex.from_tuples([("a", "x"), ("d", "y")])
    kdf1 = ks.from_pandas(pdf1)
    kdf2 = ks.from_pandas(pdf2)

    pidx1 = pdf1.set_index(("a", "x")).index
    pidx2 = pdf2.set_index(("d", "y")).index
    kidx1 = kdf1.set_index(("a", "x")).index
    kidx2 = kdf2.set_index(("d", "y")).index

    self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2))
    self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1))

    # MultiIndex
    pmidx = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])
    kmidx = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)])

    self.assert_eq(pmidx.append(pmidx), kmidx.append(kmidx))

    # MultiIndex with names
    pmidx1 = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)],
                                       names=["x", "y", "z"])
    pmidx2 = pd.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)],
                                       names=["p", "q", "r"])
    kmidx1 = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)],
                                       names=["x", "y", "z"])
    kmidx2 = ks.MultiIndex.from_tuples([("a", "x", 1), ("b", "y", 2), ("c", "z", 3)],
                                       names=["p", "q", "r"])

    self.assert_eq(pmidx1.append(pmidx2), kmidx1.append(kmidx2))
    self.assert_eq(pmidx2.append(pmidx1), kmidx2.append(kmidx1))

    # Level names of the result, checked in both append directions. The
    # second assertion previously repeated the first one verbatim, leaving
    # the reversed direction unchecked.
    self.assert_eq(
        pmidx1.append(pmidx2).names,
        kmidx1.append(kmidx2).names)
    self.assert_eq(
        pmidx2.append(pmidx1).names,
        kmidx2.append(kmidx1).names)

    # Index & MultiIndex currently is not supported
    # (kidx / kmidx are the unnamed objects created at the top of each section).
    expected_error_message = r"append\(\) between Index & MultiIndex currently is not supported"
    with self.assertRaisesRegex(NotImplementedError, expected_error_message):
        kidx.append(kmidx)
    with self.assertRaisesRegex(NotImplementedError, expected_error_message):
        kmidx.append(kidx)
def test_append(self):
    """Index.append / MultiIndex.append match pandas; mixing the two kinds is rejected.

    Covers unnamed and named indexes, indexes taken from DataFrames (with
    flat and MultiIndex columns), and MultiIndexes with and without level
    names, in both append directions.
    """
    # Index
    pidx = pd.Index(range(10000))
    kidx = ks.Index(range(10000))

    self.assert_eq(pidx.append(pidx), kidx.append(kidx))

    # Index with name
    pidx1 = pd.Index(range(10000), name='a')
    pidx2 = pd.Index(range(10000), name='b')
    kidx1 = ks.Index(range(10000), name='a')
    kidx2 = ks.Index(range(10000), name='b')

    self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2))
    self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1))

    # Index from DataFrame
    pdf1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=['a', 'b', 'c'])
    pdf2 = pd.DataFrame({'a': [7, 8, 9], 'd': [10, 11, 12]}, index=['x', 'y', 'z'])
    kdf1 = ks.from_pandas(pdf1)
    kdf2 = ks.from_pandas(pdf2)

    pidx1 = pdf1.set_index('a').index
    pidx2 = pdf2.set_index('d').index
    kidx1 = kdf1.set_index('a').index
    kidx2 = kdf2.set_index('d').index

    self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2))
    self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1))

    # Index from DataFrame with MultiIndex columns
    pdf1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    pdf2 = pd.DataFrame({'a': [7, 8, 9], 'd': [10, 11, 12]})
    pdf1.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('b', 'y')])
    pdf2.columns = pd.MultiIndex.from_tuples([('a', 'x'), ('d', 'y')])
    kdf1 = ks.from_pandas(pdf1)
    kdf2 = ks.from_pandas(pdf2)

    pidx1 = pdf1.set_index(('a', 'x')).index
    pidx2 = pdf2.set_index(('d', 'y')).index
    kidx1 = kdf1.set_index(('a', 'x')).index
    kidx2 = kdf2.set_index(('d', 'y')).index

    self.assert_eq(pidx1.append(pidx2), kidx1.append(kidx2))
    self.assert_eq(pidx2.append(pidx1), kidx2.append(kidx1))

    # MultiIndex
    pmidx = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])
    kmidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)])

    self.assert_eq(pmidx.append(pmidx), kmidx.append(kmidx))

    # MultiIndex with names
    pmidx1 = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)],
                                       names=['x', 'y', 'z'])
    pmidx2 = pd.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)],
                                       names=['p', 'q', 'r'])
    kmidx1 = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)],
                                       names=['x', 'y', 'z'])
    kmidx2 = ks.MultiIndex.from_tuples([('a', 'x', 1), ('b', 'y', 2), ('c', 'z', 3)],
                                       names=['p', 'q', 'r'])

    self.assert_eq(pmidx1.append(pmidx2), kmidx1.append(kmidx2))
    self.assert_eq(pmidx2.append(pmidx1), kmidx2.append(kmidx1))

    # Level names of the result, checked in both append directions. The
    # second assertion previously repeated the first one verbatim, leaving
    # the reversed direction unchecked.
    self.assert_eq(
        pmidx1.append(pmidx2).names,
        kmidx1.append(kmidx2).names)
    self.assert_eq(
        pmidx2.append(pmidx1).names,
        kmidx2.append(kmidx1).names)

    # Index & MultiIndex currently is not supported
    # (kidx / kmidx are the unnamed objects created at the top of each section).
    expected_error_message = r"append\(\) between Index & MultiIndex currently is not supported"
    with self.assertRaisesRegex(NotImplementedError, expected_error_message):
        kidx.append(kmidx)
    with self.assertRaisesRegex(NotImplementedError, expected_error_message):
        kmidx.append(kidx)