def test_corr(self):
    """Check that pandas-on-Spark ``corr`` agrees with pandas ``corr``.

    Compares DataFrame-level and Series-to-Series correlation results,
    first with the frame's initial column labels and then after the
    columns are replaced by a two-level ``MultiIndex``. Also checks that
    correlating a Series against a DataFrame raises ``TypeError``.
    """
    # Arrow execution is disabled because corr() uses UDT internally,
    # which is not supported.
    with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
        # NaNs are not handled for now, so missing values are filled with 0.
        expected_df = makeMissingDataframe(0.3, 42).fillna(0)
        actual_df = ps.from_pandas(expected_df)

        # DataFrame
        self.assert_eq(actual_df.corr(), expected_df.corr(), check_exact=False)

        # Series
        expected_a, expected_b = expected_df.A, expected_df.B
        actual_a, actual_b = actual_df.A, actual_df.B

        self.assertAlmostEqual(
            actual_a.corr(actual_b),
            expected_a.corr(expected_b),
        )
        # A DataFrame is not an acceptable operand for Series.corr().
        with self.assertRaises(TypeError):
            actual_a.corr(actual_df)

        # Multi-index columns: relabel both frames with the same MultiIndex.
        multi_columns = pd.MultiIndex.from_tuples(
            [("X", "A"), ("X", "B"), ("Y", "C"), ("Z", "D")]
        )
        expected_df.columns = multi_columns
        actual_df.columns = multi_columns

        # DataFrame
        self.assert_eq(actual_df.corr(), expected_df.corr(), check_exact=False)

        # Series
        label_xa = ("X", "A")
        label_xb = ("X", "B")
        expected_xa, expected_xb = expected_df[label_xa], expected_df[label_xb]
        actual_xa, actual_xb = actual_df[label_xa], actual_df[label_xb]

        self.assert_eq(
            actual_xa.corr(actual_xb),
            expected_xa.corr(expected_xb),
            almost=True,
        )
Series([1.0, 1.5, 3.2]), Series([1.0, 1.5, np.nan]), Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), Series(["a", "b", "c"]), Series(["a", np.nan, "c"]), Series(["a", None, "c"]), Series([True, False, True]), Series(dtype=object), Index([1, 2, 3]), Index([True, False, True]), DataFrame({ "x": ["a", "b", "c"], "y": [1, 2, 3] }), DataFrame(), tm.makeMissingDataframe(), tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), tm.makeTimedeltaIndex(), tm.makePeriodIndex(), Series(tm.makePeriodIndex()), Series(pd.date_range("20130101", periods=3, tz="US/Eastern")), MultiIndex.from_product([ range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2) ]), MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]), ], )