def test_survival_outcome(self):
    """Smoke-test generation of a survival outcome with a hidden parent."""
    # Adjacency: topology[child, parent] == True means `parent` feeds `child`.
    adjacency = np.zeros((5, 5), dtype=bool)
    for child, parent in [(3, 0), (4, 0), (3, 1), (3, 2), (4, 2), (4, 3)]:
        adjacency[child, parent] = True
    variable_roles = [
        "covariate", "covariate", "hidden", "treatment", "outcome"
    ]
    category_probs = [None, None, None, [0.2, 0.8], None]
    sim = CS3(topology=adjacency,
              var_types=variable_roles,
              prob_categories=category_probs,
              link_types=["linear"] * 5,
              snr=0.95,
              treatment_importances=0.5,
              outcome_types="survival",
              effect_sizes=self.no_X.effect_sizes,
              treatment_methods="logistic",
              survival_distribution="expon",
              survival_baseline=0.8)
    # Only checks that generation completes without raising.
    X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES)
def test_multi_categorical_treatment(self):
    """Multi-category treatment: per-row propensities must sum to 1 and the
    empirical category frequencies must match the requested proportions.

    Fix: the first err_msg misspelled "propensities" as "preopensities".
    """
    t_probs = pd.Series([0.2, 0.2, 0.1, 0.5])
    prob_cat = [None, None, t_probs, None]
    treatment_methods = ["quantile_gauss_fit", "odds_ratio"]
    # Tolerance (in decimal places) paired with each assignment mechanism.
    decimals = [1, 1]
    for treatment_method, decimal in zip(treatment_methods, decimals):
        sim = CS3(topology=self.no_X.topology,
                  var_types=self.no_X.var_types,
                  prob_categories=prob_cat,
                  link_types=self.no_X.link_types,
                  treatment_importances=self.no_X.treatment_importance,
                  outcome_types=self.no_X.outcome_types,
                  snr=self.no_X.snr,
                  effect_sizes=self.no_X.effect_sizes,
                  treatment_methods=treatment_method)
        # Large sample so empirical frequencies converge to the target.
        n = self.NUM_SAMPLES * 10
        X, prop, cf = sim.generate_data(num_samples=n)
        # Each row of the propensity frame is a probability distribution.
        np.testing.assert_array_almost_equal(
            prop.sum(axis="columns"), np.ones(n),
            err_msg=
            "multi-categorical propensities of treatment method {method} "
            "does not sum to 1".format(method=treatment_method))
        # Observed category proportions should be close to t_probs.
        np.testing.assert_array_almost_equal(
            np.array(X[2].value_counts(normalize=True) - t_probs),
            np.zeros(4),
            decimal=decimal,
            err_msg=
            "treatment method {method} does not produce proportions as "
            "required".format(method=treatment_method))
def test_treatment_random(self):
    """Random treatment assignment: the propensity is the same constant for
    every unit, and the empirical treatment distribution converges to the
    requested category probabilities."""
    graph = np.zeros((6, 6), dtype=bool)
    for child, parent in [(2, 0), (3, 0), (2, 1), (3, 1), (4, 2), (5, 3)]:
        graph[child, parent] = True
    roles = [
        "covariate", "covariate", "treatment", "treatment", "outcome",
        "outcome"
    ]
    categories = [None, None, [0.5, 0.5], [0.2, 0.8], None, None]
    sim = CS3(topology=graph,
              var_types=roles,
              prob_categories=categories,
              link_types=["linear"] * 6,
              snr=self.no_X.snr,
              treatment_importances=self.no_X.treatment_importance,
              outcome_types=["continuous", "continuous"],
              effect_sizes=self.no_X.effect_sizes,
              treatment_methods=["random", "random"])
    n = self.NUM_SAMPLES * 10
    X, prop, cf = sim.generate_data(num_samples=n)
    # Random assignment ignores the covariates, so every unit shares the
    # same propensity for treatment == 1.
    np.testing.assert_array_equal(prop[2][1], [0.5] * n)
    np.testing.assert_array_equal(prop[3][1], [0.8] * n)
    np.testing.assert_almost_equal(X[2].value_counts(normalize=True),
                                   [0.5, 0.5],
                                   decimal=2)
    np.testing.assert_almost_equal(
        X[3].value_counts(normalize=True).sort_index(), [0.2, 0.8],
        decimal=2)
def test_linear_linking(self):
    """An outcome linearly linked to two parents must not raise the rank of
    the generated data matrix above 2."""
    graph = np.zeros((3, 3), dtype=bool)
    graph[2, 0] = graph[2, 1] = True
    sim = CS3(topology=graph,
              var_types=["covariate", "treatment", "outcome"],
              prob_categories=[None, [0.5, 0.5], None],
              link_types="linear",
              treatment_importances=0.5,
              outcome_types=self.no_X.outcome_types,
              snr=1,
              effect_sizes=self.no_X.effect_sizes)
    X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES)
    # With snr=1 a linear combination of existing columns adds no new
    # dimension, so the numeric rank stays at 2.
    spectrum = np.linalg.svd(X.values, compute_uv=False)
    rank = np.sum(spectrum > 1e-10)
    self.assertEqual(
        rank,
        2,
        msg="discovered rank of matrix is {emp} instead of {des}."
        "so the linear linking does not work properly".format(emp=rank,
                                                              des=2))
def test_multi_treatment_outcome(self):
    """Two binary treatment/outcome pairs should yield 4 propensity columns
    and 4 counterfactual columns."""
    graph = np.zeros((6, 6), dtype=bool)
    for child, parent in [(2, 0), (3, 0), (2, 1), (3, 1), (4, 2), (5, 3)]:
        graph[child, parent] = True
    roles = [
        "covariate", "covariate", "treatment", "treatment", "outcome",
        "outcome"
    ]
    categories = [None, None, [0.5, 0.5], [0.5, 0.5], None, None]
    sim = CS3(topology=graph,
              var_types=roles,
              prob_categories=categories,
              link_types=["linear"] * 6,
              snr=self.no_X.snr,
              treatment_importances=self.no_X.treatment_importance,
              outcome_types=["continuous", "continuous"],
              effect_sizes=self.no_X.effect_sizes)
    X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES)
    # 2 binary treatments -> 2 propensity columns each = 4 in total.
    self.assertEqual(prop.shape, (self.NUM_SAMPLES, 4),
                     msg="Generated propensity shape is {X} "
                     "but supposed to be {supp}".format(
                         X=prop.shape, supp=(self.NUM_SAMPLES, 4)))
    # 2 outcomes x 2 treatment arms = 4 counterfactual columns.
    self.assertEqual(cf.shape, (self.NUM_SAMPLES, 4),
                     msg="Number of generated counterfactuals is {X} "
                     "but supposed to be {supp}".format(
                         X=cf.shape, supp=(self.NUM_SAMPLES, 4)))
def test_different_types_of_paramaters(self):
    """
    Tests to see what happens when supplying parameters of different types
    (lists, dicts, arrays, etc.)

    Fix: `dtype=np.bool` — that alias was deprecated in NumPy 1.20 and
    removed in NumPy 1.24, where it raises AttributeError; use the builtin
    `bool` (as the rest of this file already does).
    """
    topology = np.zeros(
        (5, 5), dtype=bool)  # topology[i,j] if node j is a parent of node i
    topology[1, 0] = topology[2, 0] = topology[2, 1] = topology[
        3, 1] = topology[3, 2] = topology[3, 4] = True
    var_types = [
        "hidden", "covariate", "treatment", "outcome", "covariate"
    ]
    sim = CS3(
        topology=topology,
        var_types=var_types,
        prob_categories=self.no_X.prob_cat + [None],
        link_types=None,  # should be broadcast to "linear" for all nodes
        treatment_importances=pd.Series(data=0.7, index=[2]),
        outcome_types={3: "continuous"},  # dict keyed by node index
        snr=0.5,  # scalar, should be broadcast to every node
        # effect_sizes={2: 0.8},
        effect_sizes=[0.8],
        treatment_methods=["gaussian"])
    # Scalars / short containers should be normalized to per-node values:
    self.assertTrue(all(sim.link_types == "linear"))
    self.assertEqual(len(sim.link_types), 5)
    self.assertTrue(all([x == 0.7 for x in sim.treatment_importances]))
    self.assertTrue(sim.outcome_types.equals(pd.Series({3: "continuous"})))
    self.assertTrue(all(sim.snr == 0.5))
    self.assertEqual(len(sim.snr), 5)
    self.assertTrue(sim.effect_sizes.equals(pd.Series({3: 0.8})))
def test_dependency_from_topology(self):
    """
    Tests to see that the matrix topology is well converted into graph
    dependencies for with and without dataset.
    """
    # Exercise both fixtures: purely simulated (no_X) and seeded with a
    # given dataset (with_X).
    for fixture in (self.no_X, self.with_X):
        sim = CS3(topology=fixture.topology,
                  var_types=fixture.var_types,
                  prob_categories=fixture.prob_cat,
                  link_types=fixture.link_types,
                  treatment_importances=fixture.treatment_importance,
                  outcome_types=fixture.outcome_types,
                  snr=fixture.snr,
                  effect_sizes=fixture.effect_sizes)
        self.dependency_from_topology(sim)
def test_treatment_logistic(self):
    """Smoke-test logistic treatment assignment on a two-treatment graph."""
    graph = np.zeros((6, 6), dtype=bool)
    for child, parent in [(2, 0), (3, 0), (2, 1), (3, 1), (4, 2), (5, 3)]:
        graph[child, parent] = True
    roles = [
        "covariate", "covariate", "treatment", "treatment", "outcome",
        "outcome"
    ]
    categories = [None, None, [0.5, 0.5], [0.5, 0.5], None, None]
    sim = CS3(topology=graph,
              var_types=roles,
              prob_categories=categories,
              link_types=["linear"] * 6,
              snr=self.no_X.snr,
              treatment_importances=self.no_X.treatment_importance,
              outcome_types=["continuous", "continuous"],
              effect_sizes=self.no_X.effect_sizes,
              treatment_methods=["logistic", "logistic"])
    # Only checks that generation completes without raising.
    X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES)
def test_categorical_proportions(self):
    """Empirical category frequencies of a categorical covariate should
    match the requested probability vector.

    Fix: the original aliased `self.no_X.prob_cat` and mutated it in place
    (`prob_cat[1] = probs`), leaking the change into every other test that
    reads the shared fixture; copy the list before editing it.
    """
    probs = np.array([0.25, 0.25, 0.5])
    prob_cat = list(self.no_X.prob_cat)  # shallow copy: do not mutate fixture
    prob_cat[1] = probs
    sim = CS3(topology=self.no_X.topology,
              var_types=self.no_X.var_types,
              prob_categories=prob_cat,
              link_types=self.no_X.link_types,
              treatment_importances=self.no_X.treatment_importance,
              outcome_types=self.no_X.outcome_types,
              snr=self.no_X.snr,
              effect_sizes=self.no_X.effect_sizes)
    # Large, seeded sample so the empirical histogram is stable.
    X, prop, cf = sim.generate_data(num_samples=self.NUM_SAMPLES * 10,
                                    random_seed=0)
    hist = X.loc[:, 1].value_counts(normalize=True)
    probs = pd.Series(probs)
    np.testing.assert_array_almost_equal(
        probs - hist,
        pd.Series(data=0, index=probs.index),
        decimal=2,
        err_msg="Empirical distribution {emp} of categories "
        "is too far from desired distribution {des}".format(emp=hist,
                                                            des=probs))
def test_bad_input(self):
    """Every malformed constructor or generation input must raise ValueError."""
    # Valid baseline keyword set; each case below perturbs exactly one entry.
    base = dict(topology=self.no_X.topology,
                var_types=self.no_X.var_types,
                prob_categories=self.no_X.prob_cat,
                link_types=self.no_X.link_types,
                treatment_importances=self.no_X.treatment_importance,
                snr=self.no_X.snr,
                outcome_types=self.no_X.outcome_types,
                effect_sizes=self.no_X.effect_sizes)

    # lengths: var_types shorter than the topology:
    self.assertRaises(
        ValueError, CS3,
        **dict(base, var_types=["covariate", "treatment", "outcome"]))
    # outcome has more than one treatment predecessor:
    self.assertRaises(
        ValueError, CS3,
        **dict(base,
               var_types=["covariate", "treatment", "treatment", "outcome"]))
    # No valid link type:
    self.assertRaises(
        ValueError, CS3,
        **dict(base, link_types=["linear", "linear", "linear", "leniar"]))
    # No valid treatment method:
    self.assertRaises(ValueError, CS3,
                      **dict(base, treatment_methods="rndom"))
    # lengths: snr list does not match the number of variables:
    self.assertRaises(
        ValueError, CS3,
        **dict(base, snr=[0, 1], treatment_methods="gaussian"))
    # lengths: treatment_importances does not match the number of treatments:
    self.assertRaises(
        ValueError, CS3,
        **dict(base, treatment_importances=[0.5, 0.5],
               treatment_methods="gaussian"))

    # no generation input:
    sim = CS3(**base)
    self.assertRaises(ValueError, sim.generate_data)
    # categorical treatment:
    self.assertRaises(ValueError,
                      sim.generate_treatment_col,
                      X_parents=pd.DataFrame([None]),
                      link_type=None,
                      snr=1,
                      prob_category=None)

    # wrong probabilities (negative entry):
    sim = CS3(**dict(base,
                     prob_categories=[[0.5, -0.5, 1], None, [0.5, 0.5],
                                      None]))
    self.assertRaises(ValueError, sim.generate_data, num_samples=100)
    # wrong probabilities (do not sum to one):
    sim = CS3(**dict(base,
                     prob_categories=[None, None, [0.5, 0.6], None]))
    self.assertRaises(ValueError, sim.generate_data, num_samples=100)
def test_censoring(self):
    """Censor variables in four flavors: survival vs. binary outcome, and a
    censor that shares the outcome's parents vs. an independent one."""
    roles = ["covariate", "covariate", "treatment", "censor", "outcome"]
    categories = [None, None, [0.2, 0.8], [0.85, 0.15], None]

    def build_topology(edges):
        # topology[i, j] is True when node j is a parent of node i.
        adj = np.zeros((5, 5), dtype=bool)
        for child, parent in edges:
            adj[child, parent] = True
        return adj

    def make_sim(edges, outcome_type, with_survival=True):
        # Assemble a simulator with the shared settings of this test.
        kwargs = dict(topology=build_topology(edges),
                      var_types=roles,
                      prob_categories=categories,
                      link_types=["linear"] * 5,
                      snr=0.95,
                      treatment_importances=0.5,
                      outcome_types=outcome_type,
                      effect_sizes=self.no_X.effect_sizes,
                      treatment_methods="logistic")
        if with_survival:
            kwargs.update(survival_distribution="expon",
                          survival_baseline=0.8)
        return CS3(**kwargs)

    num_samples = self.NUM_SAMPLES

    # survival censor whose parent set mirrors the outcome's:
    sim = make_sim([(2, 0), (3, 0), (4, 0), (2, 1), (3, 1), (4, 1),
                    (3, 2), (4, 2), (4, 3)], "survival")
    X, prop, cf = sim.generate_data(num_samples=num_samples,
                                    random_seed=783454)
    # Fraction of units whose outcome time does not exceed the censor time
    # should roughly match the first censor-category probability.
    self.assertAlmostEqual(np.abs(X[4].le(X[3]).sum() / num_samples),
                           categories[3][0],
                           places=1)
    # df_obs, df_cf = sim.format_for_training(X, prop, cf)

    # binary censor:
    sim = make_sim([(2, 0), (4, 0), (2, 1), (3, 1), (4, 2), (4, 3)],
                   "continuous", with_survival=False)
    X, prop, cf = sim.generate_data(num_samples=num_samples)
    self.assertAlmostEqual(X[3].astype(int).sum() / num_samples,
                           categories[3][1])
    df_obs, df_cf = sim.format_for_training(X, prop, cf)
    # Every censored unit gets a null outcome in the training frame.
    self.assertEqual(X[3].astype(int).sum(), df_obs["y_4"].isnull().sum())

    # independent categorical censor (no covariate-1 parents):
    independent_edges = [(3, 0), (4, 0), (3, 2), (4, 2), (4, 3)]
    sim = make_sim(independent_edges, "continuous")
    X, prop, cf = sim.generate_data(num_samples=num_samples)
    self.assertAlmostEqual(X[3].astype(int).sum() / num_samples,
                           categories[3][1])
    # df_obs, df_cf = sim.format_for_training(X, prop, cf)

    # independent survival censor (smoke test only):
    sim = make_sim(independent_edges, "survival")
    X, prop, cf = sim.generate_data(num_samples=10000)
def test_dataset_size(self):
    """
    Tests that the size of the generated dataset is ok under several
    configurations (all observable, with hidden variables, with a given X).
    """
    def check_shape(frame, expected, prefix):
        # Shared shape assertion producing the standard failure message.
        self.assertEqual(frame.shape, expected,
                         msg="{p} is {X} "
                         "but supposed to be {supp}".format(p=prefix,
                                                            X=frame.shape,
                                                            supp=expected))

    n = self.NUM_SAMPLES

    # No given X, all non-hidden:
    sim = CS3(topology=self.no_X.topology,
              var_types=["covariate", "covariate", "treatment", "outcome"],
              prob_categories=self.no_X.prob_cat,
              link_types=self.no_X.link_types,
              treatment_importances=self.no_X.treatment_importance,
              outcome_types=self.no_X.outcome_types,
              snr=self.no_X.snr,
              effect_sizes=self.no_X.effect_sizes)
    X, prop, cf = sim.generate_data(num_samples=n)
    check_shape(X, (n, 4), "Generated dataset shape")
    check_shape(prop, (n, 2), "Generated propensity shape")
    check_shape(cf, (n, 2), "number of generated counterfactuals")
    df_obs, df_cf = sim.format_for_training(X, prop, cf)
    check_shape(df_obs, (n, 4), "Generated dataset shape")

    # No given X, with hidden covariates (dropped from the training frame):
    sim = CS3(topology=self.no_X.topology,
              var_types=["hidden", "hidden", "treatment", "outcome"],
              prob_categories=self.no_X.prob_cat,
              link_types=self.no_X.link_types,
              treatment_importances=self.no_X.treatment_importance,
              outcome_types=self.no_X.outcome_types,
              snr=self.no_X.snr,
              effect_sizes=self.no_X.effect_sizes)
    X, prop, cf = sim.generate_data(num_samples=n)
    df_obs, df_cf = sim.format_for_training(X, prop, cf)
    check_shape(df_obs, (n, 2), "Generated dataset shape")

    # Given X, with hidden vars:
    sim = CS3(topology=self.with_X.topology,
              var_types=self.with_X.var_types,
              prob_categories=self.with_X.prob_cat,
              link_types=self.with_X.link_types,
              treatment_importances=self.with_X.treatment_importance,
              outcome_types=self.with_X.outcome_types,
              snr=self.with_X.snr,
              effect_sizes=self.with_X.effect_sizes)
    X, prop, cf = sim.generate_data(X_given=self.X_GIVEN)
    m = self.X_NUM_SAMPLES
    check_shape(X, (m, 9), "Generated dataset shape")
    check_shape(prop, (m, 2), "Generated propensity shape")
    check_shape(cf, (m, 2), "Number of counterfactuals generated")
    df_obs, df_cf = sim.format_for_training(X, prop, cf)
    check_shape(df_obs, (m, 7), "Generated dataset shape")