def test_fillna_datetime_columns():
    """Compare ``fillna("?")`` between modin and pandas on datetime-bearing frames.

    Two inputs are exercised: one whose "B" column is built with
    ``date_range`` and one whose "B" column is a list of timestamps that
    ends with ``NaT``.
    """
    # Input 1: the datetime column comes from a date range.
    range_backed = {
        "A": [-1, -2, np.nan],
        "B": pd.date_range("20130101", periods=3),
        "C": ["foo", "bar", None],
        "D": ["foo2", "bar2", None],
    }
    expected_frame = pandas.DataFrame(
        range_backed, index=pd.date_range("20130110", periods=3)
    )
    actual_frame = pd.DataFrame(
        range_backed, index=pd.date_range("20130110", periods=3)
    )
    df_equals(actual_frame.fillna("?"), expected_frame.fillna("?"))

    # Input 2: the datetime column is an explicit list that ends with NaT.
    nat_backed = {
        "A": [-1, -2, np.nan],
        "B": [
            pandas.Timestamp("2013-01-01"),
            pandas.Timestamp("2013-01-02"),
            pandas.NaT,
        ],
        "C": ["foo", "bar", None],
        "D": ["foo2", "bar2", None],
    }
    expected_frame = pandas.DataFrame(
        nat_backed, index=pd.date_range("20130110", periods=3)
    )
    actual_frame = pd.DataFrame(
        nat_backed, index=pd.date_range("20130110", periods=3)
    )
    df_equals(actual_frame.fillna("?"), expected_frame.fillna("?"))
def test_tz_convert():
    """Compare ``tz_convert("UTC")`` between modin and pandas frames.

    The conversion is checked on a plain tz-aware datetime index and on
    level 0 of a two-level ``MultiIndex``.
    """
    range_kwargs = dict(periods=500, freq="2D", tz="America/Los_Angeles")
    modin_index = pd.date_range("1/1/2012", **range_kwargs)
    pandas_index = pandas.date_range("1/1/2012", **range_kwargs)
    values = np.random.randint(0, 100, size=(len(modin_index), 4))

    # Plain datetime index.
    modin_frame = pd.DataFrame(values, index=modin_index)
    pandas_frame = pandas.DataFrame(values, index=pandas_index)
    df_equals(
        modin_frame.tz_convert("UTC", axis=0),
        pandas_frame.tz_convert("UTC", axis=0),
    )

    # MultiIndex whose first level is the tz-aware index.
    modin_multi_index = pd.MultiIndex.from_arrays(
        [modin_index, range(len(modin_index))]
    )
    pandas_multi_index = pandas.MultiIndex.from_arrays(
        [pandas_index, range(len(modin_index))]
    )
    modin_multi_frame = pd.DataFrame(values, index=modin_multi_index)
    pandas_multi_frame = pandas.DataFrame(values, index=pandas_multi_index)
    df_equals(
        modin_multi_frame.tz_convert("UTC", axis=0, level=0),
        pandas_multi_frame.tz_convert("UTC", axis=0, level=0),
    )
def test_asfreq():
    """Check that ``DataFrame.asfreq`` emits a ``UserWarning``."""
    minute_index = pd.date_range("1/1/2000", periods=4, freq="T")
    column = pd.Series([0.0, None, 2.0, 3.0], index=minute_index)
    frame = pd.DataFrame({"s": column})
    # Only the fallback to pandas is under test, so seeing the warning is
    # enough; the resampled result itself is not inspected.
    with pytest.warns(UserWarning):
        frame.asfreq(freq="30S")
def test_getitem_datetime_slice():
    """Compare ``df[slice]`` with string date bounds between modin and pandas."""
    dates = pd.date_range("2017/1/4", periods=1000)
    payload = {"data": range(1000)}
    modin_frame = pd.DataFrame(data=payload, index=dates)
    pandas_frame = pandas.DataFrame(data=payload, index=dates)

    date_window = slice("2017-01-06", "2017-01-09")
    df_equals(modin_frame[date_window], pandas_frame[date_window])
def test_first():
    """Compare ``DataFrame.first`` between modin and pandas for two offsets."""
    dates = pd.date_range("2010-04-09", periods=400, freq="2D")
    modin_frame = pd.DataFrame(
        {"A": list(range(400)), "B": list(range(400))}, index=dates
    )
    pandas_frame = pandas.DataFrame(
        {"A": list(range(400)), "B": list(range(400))}, index=dates
    )

    for offset in ("3D", "20D"):
        df_equals(modin_frame.first(offset), pandas_frame.first(offset))
def test_at_time():
    """Compare ``DataFrame.at_time`` between modin and pandas.

    Two times of day are checked along the index, then ``"12:00"`` is checked
    along ``axis=1`` of the transposed frames.
    """
    stamps = pd.date_range("2008-01-01", periods=1000, freq="12H")
    modin_frame = pd.DataFrame(
        {"A": list(range(1000)), "B": list(range(1000))}, index=stamps
    )
    pandas_frame = pandas.DataFrame(
        {"A": list(range(1000)), "B": list(range(1000))}, index=stamps
    )

    for time_of_day in ("12:00", "3:00"):
        df_equals(
            modin_frame.at_time(time_of_day), pandas_frame.at_time(time_of_day)
        )

    # Same selection, but with the timestamps on the column axis.
    df_equals(
        modin_frame.T.at_time("12:00", axis=1),
        pandas_frame.T.at_time("12:00", axis=1),
    )
def test_tz_localize():
    """Compare ``tz_localize`` between modin and pandas for two time zones."""
    naive_index = pd.date_range("1/1/2012", periods=400, freq="2D")
    values = np.random.randint(0, 100, size=(len(naive_index), 4))
    modin_frame = pd.DataFrame(values, index=naive_index)
    pandas_frame = pandas.DataFrame(values, index=naive_index)

    for zone in ("UTC", "America/Los_Angeles"):
        df_equals(
            modin_frame.tz_localize(zone, axis=0),
            pandas_frame.tz_localize(zone, axis=0),
        )
def test_dataframe_dt_index(axis, on, closed, window):
    """Compare modin and pandas rolling results on a datetime-indexed frame.

    Parameters
    ----------
    axis
        Axis forwarded to ``rolling``; when it equals ``"columns"`` both
        frames are transposed first.
    on
        Name of an extra datetime column to roll on. It is only used when
        ``axis == 0`` and `window` is a string; otherwise it is reset to None.
    closed
        Forwarded unchanged to ``rolling``.
    window
        Forwarded to ``rolling``. An ``int`` selects the ``corr``/``cov``
        checks; anything else selects the skew/apply/aggregate/quantile/count
        checks.
    """
    minute_index = pandas.date_range("31/12/2000", periods=12, freq="T")
    columns = {"A": range(12), "B": range(12)}
    pandas_df = pandas.DataFrame(columns, index=minute_index)
    modin_df = pd.DataFrame(columns, index=minute_index)

    use_on_column = on is not None and axis == 0 and isinstance(window, str)
    if not use_on_column:
        on = None
    else:
        pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T")
        modin_df[on] = pd.date_range("22/06/1941", periods=12, freq="T")

    if axis == "columns":
        pandas_df, modin_df = pandas_df.T, modin_df.T

    rolling_kwargs = dict(window=window, on=on, axis=axis, closed=closed)
    pandas_rolled = pandas_df.rolling(**rolling_kwargs)
    modin_rolled = modin_df.rolling(**rolling_kwargs)

    if not isinstance(window, int):
        df_equals(modin_rolled.skew(), pandas_rolled.skew())
        df_equals(
            modin_rolled.apply(np.sum, raw=True),
            pandas_rolled.apply(np.sum, raw=True),
        )
        df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum))
        df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))
        # `Rolling.count` has a buggy side-effect on the other rolling functions:
        # https://github.com/pandas-dev/pandas/issues/39554
        # Keep the `.count` check last until that is fixed in pandas, so the
        # checks above are not affected by it.
        df_equals(modin_rolled.count(), pandas_rolled.count())
        return

    # These functions are very slow on the data used by test_rolling, so they
    # are exercised here instead.
    for method_name in ("corr", "cov"):
        for second_arg in (True, False):
            df_equals(
                getattr(modin_rolled, method_name)(modin_df, second_arg),
                getattr(pandas_rolled, method_name)(pandas_df, second_arg),
            )

    if axis == 0:
        # Same methods against a single column (the first one) of each frame.
        for method_name in ("cov", "corr"):
            df_equals(
                getattr(modin_rolled, method_name)(
                    modin_df[modin_df.columns[0]], True
                ),
                getattr(pandas_rolled, method_name)(
                    pandas_df[pandas_df.columns[0]], True
                ),
            )
def test_dataframe_dt_index(axis, on, closed, window):
    """Compare modin and pandas rolling results on a datetime-indexed frame.

    Parameters
    ----------
    axis
        Axis forwarded to ``rolling``; when it equals ``"columns"`` both
        frames are transposed first.
    on
        Name of an extra datetime column to roll on. It is only used when
        ``axis == 0`` and `window` is a string; otherwise it is reset to None.
    closed
        Forwarded unchanged to ``rolling``.
    window
        Forwarded to ``rolling``. An ``int`` selects the ``corr``/``cov``
        checks; anything else selects the skew/apply/aggregate/quantile/count
        checks.
    """
    index = pandas.date_range("31/12/2000", periods=12, freq="T")
    data = {"A": range(12), "B": range(12)}
    pandas_df = pandas.DataFrame(data, index=index)
    modin_df = pd.DataFrame(data, index=index)
    if on is not None and axis == 0 and isinstance(window, str):
        pandas_df[on] = pandas.date_range("22/06/1941", periods=12, freq="T")
        modin_df[on] = pd.date_range("22/06/1941", periods=12, freq="T")
    else:
        on = None
    if axis == "columns":
        pandas_df = pandas_df.T
        modin_df = modin_df.T
    pandas_rolled = pandas_df.rolling(window=window, on=on, axis=axis, closed=closed)
    modin_rolled = modin_df.rolling(window=window, on=on, axis=axis, closed=closed)
    if isinstance(window, int):
        # These functions are very slow on the data used by test_rolling
        df_equals(
            modin_rolled.corr(modin_df, True), pandas_rolled.corr(pandas_df, True)
        )
        df_equals(
            modin_rolled.corr(modin_df, False), pandas_rolled.corr(pandas_df, False)
        )
        df_equals(modin_rolled.cov(modin_df, True), pandas_rolled.cov(pandas_df, True))
        df_equals(
            modin_rolled.cov(modin_df, False), pandas_rolled.cov(pandas_df, False)
        )
        if axis == 0:
            df_equals(
                modin_rolled.cov(modin_df[modin_df.columns[0]], True),
                pandas_rolled.cov(pandas_df[pandas_df.columns[0]], True),
            )
            df_equals(
                modin_rolled.corr(modin_df[modin_df.columns[0]], True),
                pandas_rolled.corr(pandas_df[pandas_df.columns[0]], True),
            )
    else:
        df_equals(modin_rolled.skew(), pandas_rolled.skew())
        df_equals(
            modin_rolled.apply(np.sum, raw=True),
            pandas_rolled.apply(np.sum, raw=True),
        )
        df_equals(modin_rolled.aggregate(np.sum), pandas_rolled.aggregate(np.sum))
        df_equals(modin_rolled.quantile(0.1), pandas_rolled.quantile(0.1))
        # `Rolling.count` has a buggy side-effect on other rolling functions,
        # described in https://github.com/pandas-dev/pandas/issues/39554
        # The `.count` check therefore runs last, so that it cannot influence
        # the other comparisons made on the same rolling objects.
        df_equals(modin_rolled.count(), pandas_rolled.count())
def test_reindex_like():
    """Check that ``DataFrame.reindex_like`` emits a ``UserWarning``.

    NOTE(review): presumably this is the default-to-pandas warning; the test
    only asserts the warning category.
    """
    template_rows = [
        [24.3, 75.7, "high"],
        [31, 87.8, "high"],
        [22, 71.6, "medium"],
        [35, 95, "medium"],
    ]
    template = pd.DataFrame(
        template_rows,
        columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
        index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
    )

    # Fewer columns than the template and no row for 2014-02-14.
    partial_rows = [[28, "low"], [30, "low"], [35.1, "medium"]]
    partial = pd.DataFrame(
        partial_rows,
        columns=["temp_celsius", "windspeed"],
        index=pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]),
    )

    with pytest.warns(UserWarning):
        partial.reindex_like(template)
def test_to_timestamp():
    """Check that ``to_period().to_timestamp()`` emits a ``UserWarning``."""
    month_index = pd.date_range("1/1/2012", periods=5, freq="M")
    values = np.random.randint(0, 100, size=(len(month_index), 4))
    frame = pd.DataFrame(values, index=month_index)
    # Both conversions run inside the context; only the warning is asserted.
    with pytest.warns(UserWarning):
        as_periods = frame.to_period()
        as_periods.to_timestamp()
def test_tshift():
    """Compare ``DataFrame.tshift(4)`` between modin and pandas."""
    month_index = pd.date_range("1/1/2012", periods=5, freq="M")
    values = np.random.randint(0, 100, size=(len(month_index), 4))
    modin_frame = pd.DataFrame(values, index=month_index)
    pandas_frame = pandas.DataFrame(values, index=month_index)

    shift_by = 4
    df_equals(modin_frame.tshift(shift_by), pandas_frame.tshift(shift_by))
def test_to_timestamp():
    """Run ``to_period().to_timestamp()`` under ``warns_that_defaulting_to_pandas``."""
    month_index = pd.date_range("1/1/2012", periods=5, freq="M")
    values = np.random.randint(0, 100, size=(len(month_index), 4))
    frame = pd.DataFrame(values, index=month_index)
    # Both conversions run inside the context; the result is not inspected.
    with warns_that_defaulting_to_pandas():
        as_periods = frame.to_period()
        as_periods.to_timestamp()