def test_subset():
    """Selecting columns keeps the weights, and equals() compares them."""
    cols = ["x", "y"]
    full = mdf.MicroDataFrame(
        {"x": [1, 2], "y": [3, 4], "z": [5, 6]}, weights=[7, 8]
    )
    expected = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4]}, weights=[7, 8])
    assert full[cols].equals(expected)

    # Same data but shifted weights must no longer compare equal.
    reweighted = expected.copy()
    reweighted.weights += 1
    assert not full[cols].equals(reweighted)
def test_df_init():
    """Weights supplied in three different ways all drive the weighted mean."""
    values = np.array([0, 1, 1])
    weights = np.array([3, 0, 9])

    def check_weighted_mean(frame):
        # Column mean must match numpy's weighted average exactly.
        assert frame.a.mean() == np.average(values, weights=weights)

    # 1. Weights passed to the constructor.
    frame = mdf.MicroDataFrame({"a": values}, weights=weights)
    check_weighted_mean(frame)

    # 2. Weights attached afterwards via set_weights.
    frame = mdf.MicroDataFrame()
    frame["a"] = values
    frame.set_weights(weights)
    check_weighted_mean(frame)

    # 3. Weights taken from an existing column via set_weight_col.
    frame = mdf.MicroDataFrame()
    frame["a"] = values
    frame["w"] = weights
    frame.set_weight_col("w")
    check_weighted_mean(frame)
def test_concat():
    """mdf.concat returns a MicroDataFrame with weights; pd.concat does not."""
    first = mdf.MicroDataFrame({"x": [1, 2]}, weights=[3, 4])
    second = mdf.MicroDataFrame({"y": [5, 6]}, weights=[7, 8])
    frames = [first, second]

    # Verify that pd.concat returns DataFrame (probably no way to fix this).
    plain_long = pd.concat(frames)
    assert isinstance(plain_long, pd.DataFrame)
    assert not isinstance(plain_long, mdf.MicroDataFrame)

    # Verify that mdf.concat works.
    micro_long = mdf.concat(frames)
    assert isinstance(micro_long, mdf.MicroDataFrame)

    # Weights should be preserved.
    stacked_weights = pd.concat([first.weights, second.weights])
    assert micro_long.weights.equals(stacked_weights)

    # Verify it works horizontally too (take the first set of weights).
    micro_wide = mdf.concat(frames, axis=1)
    assert isinstance(micro_wide, mdf.MicroDataFrame)
    assert micro_wide.weights.equals(first.weights)
def test_copy_equals():
    """A copy equals its source until the copy's weights are changed."""
    original = mdf.MicroDataFrame(
        {"x": [1, 2], "y": [3, 4], "z": [5, 6]}, weights=[7, 8]
    )
    duplicate = original.copy()
    doubled = duplicate.copy()
    doubled.weights *= 2

    assert original.equals(duplicate)
    assert not original.equals(doubled)

    # Same for a MicroSeries.
    assert original.x.equals(duplicate.x)
    assert not original.x.equals(doubled.x)
def concat(*args, **kwargs):
    """Concatenates MicroDataFrame objects, preserving weights.

    If concatenating horizontally, the first set of weights are used.

    All args and kwargs are passed to pd.concat.

    :param args: Positional arguments, interpreted exactly as by pd.concat.
    :param kwargs: Keyword arguments, interpreted exactly as by pd.concat.
    :raises TypeError: If the arguments do not match pd.concat's signature.
    :return: MicroDataFrame with concatenated weights.
    :rtype: mdf.MicroDataFrame
    """
    # Resolve the arguments against pd.concat's own signature so that objs and
    # axis are found whether they were passed positionally or by keyword.
    signature = inspect.signature(pd.concat)
    bound = signature.bind(*args, **kwargs)
    objs = bound.arguments["objs"]
    axis = bound.arguments.get("axis", signature.parameters["axis"].default)
    # A one-shot iterator would be exhausted by pd.concat before the weights
    # are collected below, so materialize it once and pass the list along.
    if iter(objs) is objs:
        objs = list(objs)
        bound.arguments["objs"] = objs
    # Create result, starting with pd.concat. Only the arguments the caller
    # actually supplied are forwarded.
    res = mdf.MicroDataFrame(pd.concat(*bound.args, **bound.kwargs))
    # pd.concat silently drops None entries, so they carry no weights either.
    weighted = [obj for obj in objs if obj is not None]
    # Assign weights depending on axis; accept pandas' string aliases for
    # the row axis as well as 0.
    if axis in (0, "index", "rows"):
        res.weights = pd.concat([obj.weights for obj in weighted])
    else:
        # If concatenating horizontally, use the first set of weights.
        res.weights = weighted[0].weights
    return res
import pandas as pd
import pytest

import microdf as mdf

# Shared fixtures: values, a second column and their weights.
X = [1, 5, 2]
Y = [0, -6, 3]
W = [4, 1, 1]
df = pd.DataFrame({"x": X, "y": Y, "w": W})
ms = mdf.MicroSeries(X, weights=W)
md = mdf.MicroDataFrame(df[["x", "y"]], weights=W)

# Also make a version with groups.
df2 = df.copy(deep=True)
df2.x *= 2
df2.y *= 1.5
dfg = pd.concat([df, df2])
dfg["g"] = ["a", "a", "a", "b", "b", "b"]
# NOTE(review): W has three entries while dfg has six rows - confirm intended.
mdg = mdf.MicroDataFrame(dfg[["x", "y", "g"]], weights=W)


def test_weighted_quantile():
    """Smoke test: weighted_quantile runs and its result converts to a list."""
    quantiles = [0, 0.5, 1]
    mdf.weighted_quantile(df, "x", "w", quantiles).tolist()


def test_weighted_median():
    """Check the unweighted median; smoke-test weighted and grouped calls."""
    unweighted = mdf.weighted_median(df, "x")
    assert unweighted == 2
    mdf.weighted_median(df, "x", "w")
    # Test with groups.
    mdf.weighted_median(dfg, "x", "w", "g")
def df(self, cols, map_to="person"):
    """Build a weighted MicroDataFrame with one column per requested variable.

    Each name in ``cols`` is computed with ``self.calc(name, map_to=map_to)``
    and the frame is weighted by ``self.weight_vars[map_to]``.
    """
    columns = {name: self.calc(name, map_to=map_to) for name in cols}
    weights = self.weight_vars[map_to]
    return mdf.MicroDataFrame(columns, weights=weights)
import microdf as mdf
import numpy as np
import pandas as pd

# Shared fixture: four records with income, poverty threshold and weight.
df = pd.DataFrame(
    {
        "income": [-10, 0, 10, 20],
        "threshold": [15, 10, 15, 10],
        "weight": [1, 2, 3, 4],
    }
)
md = mdf.MicroDataFrame(df[["income", "threshold"]], weights=df.weight)


def test_poverty_rate():
    """Poverty rate matches the pinned values, unweighted and weighted."""
    # Unweighted
    unweighted = mdf.poverty_rate(df, "income", "threshold")
    assert np.allclose(unweighted, 3 / 4)
    # Weighted
    weighted = mdf.poverty_rate(df, "income", "threshold", "weight")
    assert np.allclose(weighted, 6 / 10)
    from_micro_frame = md.poverty_rate("income", "threshold")
    assert np.allclose(from_micro_frame, 6 / 10)


def test_deep_poverty_rate():
    """Deep poverty rate matches the pinned values, unweighted and weighted."""
    # Unweighted
    unweighted = mdf.deep_poverty_rate(df, "income", "threshold")
    assert np.allclose(unweighted, 2 / 4)
    # Weighted
    weighted = mdf.deep_poverty_rate(df, "income", "threshold", "weight")
    assert np.allclose(weighted, 3 / 10)
    from_micro_frame = md.deep_poverty_rate("income", "threshold")
    assert np.allclose(from_micro_frame, 3 / 10)
def test_multiple_groupby():
    """Grouping an unweighted frame by two columns sums each group's z."""
    frame = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]})
    grouped_sum = frame.groupby(["x", "y"]).z.sum()
    matches = grouped_sum == np.array([5, 6])
    assert matches.all()
def test_unweighted_groupby():
    """Grouping an unweighted frame by one column sums each group's z."""
    frame = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]})
    grouped_values = frame.groupby("x").z.sum().values
    matches = grouped_values == np.array([5.0, 6.0])
    assert matches.all()
def test_value_subset():
    """Boolean-mask row selection keeps the weights the same shape as the data."""
    frame = mdf.MicroDataFrame(
        {"x": [1, 2, 3], "y": [1, 2, 2]}, weights=[4, 5, 6]
    )
    filtered = frame[frame.y > 1]
    assert filtered.y.shape == filtered.weights.shape
def test_reset_index():
    """reset_index() returns an object whose class is exactly MicroDataFrame."""
    frame = mdf.MicroDataFrame({"x": [1, 2, 3]}, weights=[4, 5, 6])
    reindexed = frame.reset_index()
    assert reindexed.__class__ == MicroDataFrame
def test_set_index():
    """Column access yields a MicroSeries before and after replacing the index."""
    frame = mdf.MicroDataFrame({"x": [1, 2, 3]}, weights=[4, 5, 6])
    assert frame.x.__class__ == MicroSeries
    # Assigning a new index must not change the column's class.
    frame.index = [1, 2, 3]
    assert frame.x.__class__ == MicroSeries
"in_deep_poverty_bhc", ] BASELINE_HH_COLS = ["household_weight", "people", "region"] # Extract these for baseline too. REFORM_HH_COLS = [ "household_net_income", "equiv_household_net_income", "poverty_gap_bhc", "poverty_gap_ahc", ] p_base = mdf.MicroDataFrame( baseline_sim.df(BASELINE_PERSON_COLS + REFORM_PERSON_COLS, map_to="person"), weights="household_weight", ) p_base.rename( dict(zip(REFORM_PERSON_COLS, [i + "_base" for i in REFORM_PERSON_COLS])), axis=1, inplace=True, ) hh_base = mdf.MicroDataFrame( baseline_sim.df(BASELINE_HH_COLS + REFORM_HH_COLS, map_to="household"), weights="household_weight", ) hh_base.rename( dict(zip(REFORM_HH_COLS, [i + "_base" for i in REFORM_HH_COLS])), axis=1,