def test_obj_vs_from_arrays():
    """Check that ``DoubleMLData.from_arrays`` reproduces the source data.

    For several generated data sets, the x, y, d (and, where used, z)
    columns of an existing data object are handed to ``from_arrays`` and the
    ``data`` of the result is compared with the ``data`` of the source
    object.
    """

    def columns_of(dml_obj, *col_attrs):
        # Slice the object's DataFrame by each named column attribute
        # (e.g. 'x_cols'), in the order given.
        return [dml_obj.data[getattr(dml_obj, attr)] for attr in col_attrs]

    np.random.seed(3141)

    # PLR data: only x, y and d are passed.
    source = make_plr_CCDDHNR2018(n_obs=100)
    rebuilt = DoubleMLData.from_arrays(
        *columns_of(source, 'x_cols', 'y_col', 'd_cols'))
    assert rebuilt.data.equals(source.data)

    # PLIV data from the local helper: z is passed as well.
    source = _make_pliv_data(n_obs=100)
    rebuilt = DoubleMLData.from_arrays(
        *columns_of(source, 'x_cols', 'y_col', 'd_cols', 'z_cols'))
    assert rebuilt.data.equals(source.data)

    # PLIV data with five instruments; compared by values only.
    source = make_pliv_CHS2015(n_obs=100, dim_z=5)
    rebuilt = DoubleMLData.from_arrays(
        *columns_of(source, 'x_cols', 'y_col', 'd_cols', 'z_cols'))
    assert np.array_equal(rebuilt.data, source.data)  # z_cols name differ

    # Two treatment columns: relabel the first ten columns of a PLR frame
    # as seven covariates, the outcome and two treatments.
    x_names = [f'X{i+1}' for i in range(7)]
    relabelled = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    relabelled.columns = x_names + ['y', 'd1', 'd2']
    source = DoubleMLData(relabelled, 'y', ['d1', 'd2'], x_names)
    rebuilt = DoubleMLData.from_arrays(
        *columns_of(source, 'x_cols', 'y_col', 'd_cols'))
    assert np.array_equal(rebuilt.data, source.data)
def test_obj_vs_from_arrays():
    """Check that the ``from_arrays`` constructors reproduce the source data.

    Covers ``DoubleMLData.from_arrays`` for several generated data sets and
    ``DoubleMLClusterData.from_arrays`` with two cluster variables as well as
    with a single one. In each case the relevant columns of an existing data
    object are handed to ``from_arrays`` and the resulting ``data`` is
    compared with the expected frame.
    """

    def columns_of(dml_obj, *col_attrs):
        # Slice the object's DataFrame by each named column attribute
        # (e.g. 'x_cols'), in the order given.
        return [dml_obj.data[getattr(dml_obj, attr)] for attr in col_attrs]

    np.random.seed(3141)

    # PLR data: only x, y and d are passed.
    source = make_plr_CCDDHNR2018(n_obs=100)
    rebuilt = DoubleMLData.from_arrays(
        *columns_of(source, 'x_cols', 'y_col', 'd_cols'))
    assert rebuilt.data.equals(source.data)

    # PLIV data from the local helper: z is passed as well.
    source = _make_pliv_data(n_obs=100)
    rebuilt = DoubleMLData.from_arrays(
        *columns_of(source, 'x_cols', 'y_col', 'd_cols', 'z_cols'))
    assert rebuilt.data.equals(source.data)

    # PLIV data with five instruments; compared by values only.
    source = make_pliv_CHS2015(n_obs=100, dim_z=5)
    rebuilt = DoubleMLData.from_arrays(
        *columns_of(source, 'x_cols', 'y_col', 'd_cols', 'z_cols'))
    assert np.array_equal(rebuilt.data, source.data)  # z_cols name differ

    # Two treatment columns: relabel the first ten columns of a PLR frame
    # as seven covariates, the outcome and two treatments.
    x_names = [f'X{i+1}' for i in range(7)]
    relabelled = make_plr_CCDDHNR2018(n_obs=100).data.copy().iloc[:, :10]
    relabelled.columns = x_names + ['y', 'd1', 'd2']
    source = DoubleMLData(relabelled, 'y', ['d1', 'd2'], x_names)
    rebuilt = DoubleMLData.from_arrays(
        *columns_of(source, 'x_cols', 'y_col', 'd_cols'))
    assert np.array_equal(rebuilt.data, source.data)

    # Cluster data with both cluster variables; the expected frame is the
    # source frame with the columns renamed as in the mapping below.
    clustered = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
    rebuilt = DoubleMLClusterData.from_arrays(
        *columns_of(clustered, 'x_cols', 'y_col', 'd_cols', 'cluster_cols', 'z_cols'))
    expected = clustered.data.copy().rename(
        columns={'cluster_var_i': 'cluster_var1',
                 'cluster_var_j': 'cluster_var2',
                 'Y': 'y',
                 'D': 'd',
                 'Z': 'z'})
    assert rebuilt.data.equals(expected)

    # with a single cluster variable
    rebuilt = DoubleMLClusterData.from_arrays(
        *columns_of(clustered, 'x_cols', 'y_col', 'd_cols'),
        clustered.data[clustered.cluster_cols[1]],
        *columns_of(clustered, 'z_cols'))
    expected = clustered.data.copy().drop(columns='cluster_var_i').rename(
        columns={'cluster_var_j': 'cluster_var',
                 'Y': 'y',
                 'D': 'd',
                 'Z': 'z'})
    assert rebuilt.data.equals(expected)
def test_dml_data_w_missings(generate_data_irm_w_missings):
    """Exercise the ``force_all_x_finite`` option of ``DoubleMLData``.

    Parameters
    ----------
    generate_data_irm_w_missings :
        Fixture value that unpacks into ``(x, y, d)`` arrays.
        NOTE(review): presumably ``x`` contains NaNs (at least in its first
        column) -- confirm against the fixture definition.

    The test checks which settings accept the data, which settings raise,
    that invalid settings are rejected with the expected exception type and
    message, and that the property can be reassigned afterwards.
    """
    x, y, d = generate_data_irm_w_missings
    first_x_col = x[:, 0]

    # These two settings are expected to accept the data without raising.
    dml_data = DoubleMLData.from_arrays(x, y, d, force_all_x_finite=False)
    _ = DoubleMLData.from_arrays(x, y, d, force_all_x_finite='allow-nan')

    nan_or_inf_msg = r"Input contains NaN, infinity or a value too large for dtype\('float64'\)."
    with pytest.raises(ValueError, match=nan_or_inf_msg):
        _ = DoubleMLData.from_arrays(x, y, d, force_all_x_finite=True)

    # Re-use the first x column as y, as d and as z in turn; each call is
    # expected to raise even though the x check is switched off.
    for bad_arrays in ((x, first_x_col, d),
                       (x, y, first_x_col),
                       (x, y, d, first_x_col)):
        with pytest.raises(ValueError, match=nan_or_inf_msg):
            _ = DoubleMLData.from_arrays(*bad_arrays, force_all_x_finite=False)

    # An infinite entry in x must be rejected under 'allow-nan'.
    inf_msg = r"Input contains infinity or a value too large for dtype\('float64'\)."
    xx = np.copy(x)
    xx[0, 0] = np.inf
    with pytest.raises(ValueError, match=inf_msg):
        _ = DoubleMLData.from_arrays(xx, y, d, force_all_x_finite='allow-nan')

    # Invalid settings: a wrong type yields TypeError, an unknown string
    # yields ValueError -- for ``from_arrays`` and for the constructor.
    invalid_settings = (
        (TypeError, 1,
         "Invalid force_all_x_finite. force_all_x_finite must be True, False or 'allow-nan'."),
        (ValueError, 'allownan',
         "Invalid force_all_x_finite allownan. force_all_x_finite must be True, False or 'allow-nan'."),
    )
    for exc_type, bad_setting, invalid_msg in invalid_settings:
        with pytest.raises(exc_type, match=invalid_msg):
            _ = DoubleMLData.from_arrays(xx, y, d, force_all_x_finite=bad_setting)
        with pytest.raises(exc_type, match=invalid_msg):
            _ = DoubleMLData(dml_data.data, y_col='y', d_cols='d', force_all_x_finite=bad_setting)

    # Switching the existing object to strict checking must raise ...
    with pytest.raises(ValueError, match=nan_or_inf_msg):
        dml_data.force_all_x_finite = True

    # ... yet the test expects the property to read back as True afterwards,
    # and to follow the subsequent reassignments.
    assert dml_data.force_all_x_finite is True
    dml_data.force_all_x_finite = False
    assert dml_data.force_all_x_finite is False
    dml_data.force_all_x_finite = 'allow-nan'
    assert dml_data.force_all_x_finite == 'allow-nan'
def test_dml_data_no_instr():
    """Check that data without instruments reports ``z is None`` and
    ``n_instr == 0``.

    Verified for the object returned by ``make_plr_CCDDHNR2018`` with its
    default return type and for an object built via
    ``DoubleMLData.from_arrays`` from the generator's array output.
    """
    np.random.seed(3141)

    # Object as returned by the generator's default return type.
    generated = make_plr_CCDDHNR2018(n_obs=100)
    assert generated.z is None
    assert generated.n_instr == 0

    # Object assembled from the raw arrays; no z is passed.
    covariates, outcome, treatment = make_plr_CCDDHNR2018(n_obs=100, return_type='array')
    from_arrays_obj = DoubleMLData.from_arrays(covariates, outcome, treatment)
    assert from_arrays_obj.z is None
    assert from_arrays_obj.n_instr == 0
def dml_data_fixture(generate_data1):
    """Build the same ``DoubleMLData`` from numpy arrays and from a DataFrame.

    Parameters
    ----------
    generate_data1 :
        DataFrame-like input with columns ``'y'`` and ``'d'`` plus covariate
        columns whose names start with ``'X'``.

    Returns
    -------
    dict
        ``{'obj_from_np': ..., 'obj_from_pd': ...}`` holding the object built
        via ``DoubleMLData.from_arrays`` and the one built via the
        ``DoubleMLData`` constructor.

    NOTE(review): no ``@pytest.fixture`` decorator is visible on this
    function in this view -- confirm it is registered as a fixture.
    """
    data = generate_data1
    np.random.seed(3141)

    # Covariates are identified purely by their 'X' name prefix.
    is_covariate = data.columns.str.startswith('X')
    x_cols = data.columns[is_covariate].tolist()

    x_values = data[x_cols].values
    y_values = data['y'].values
    d_values = data['d'].values
    built_from_np = DoubleMLData.from_arrays(x_values, y_values, d_values)
    built_from_pd = DoubleMLData(data, 'y', ['d'], x_cols)

    return {'obj_from_np': built_from_np,
            'obj_from_pd': built_from_pd}
# Module-level objects shared by the test(s) below. They are created once,
# in this order, after fixing NumPy's global seed.
np.random.seed(3141)
dml_data = make_plr_CCDDHNR2018(n_obs=10)
ml_g = Lasso()
ml_m = Lasso()
ml_r = Lasso()  # not referenced in the part of the file visible here
dml_plr = DoubleMLPLR(dml_data, ml_g, ml_m)
dml_data_irm = make_irm_data(n_obs=10)
dml_data_iivm = make_iivm_data(n_obs=10)
dml_data_pliv = make_pliv_CHS2015(n_obs=10, dim_z=1)
dml_cluster_data_pliv = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)

# Binary-outcome variants: positive outcomes become 1, negative ones 0
# (an outcome of exactly 0 is left unchanged).
(x, y, d, z) = make_iivm_data(n_obs=30, return_type="array")
y[y > 0] = 1
y[y < 0] = 0
dml_data_irm_binary_outcome = DoubleMLData.from_arrays(x, y, d)
dml_data_iivm_binary_outcome = DoubleMLData.from_arrays(x, y, d, z)


@pytest.mark.ci
def test_doubleml_exception_data():
    """Check that ``DoubleMLPLR`` rejects unsuitable data arguments."""
    # A plain DataFrame instead of a DoubleMLData object -> TypeError.
    msg = 'The data must be of DoubleMLData type.'
    with pytest.raises(TypeError, match=msg):
        _ = DoubleMLPLR(pd.DataFrame(), ml_g, ml_m)

    # PLR with IV
    # dml_data_pliv was generated with dim_z=1; the expected message names Z1.
    msg = (r'Incompatible data. Z1 have been set as instrumental variable\(s\). '
           'To fit a partially linear IV regression model use DoubleMLPLIV instead of DoubleMLPLR.')
    with pytest.raises(ValueError, match=msg):
        _ = DoubleMLPLR(dml_data_pliv, ml_g, ml_m)