def naive_bayes_plus_parents(
    categories: int = 3,
    samples: int = 500,
    parents: int = 3,
    children: int = 3,
    p_z: float = 0.9,
    p_c: float = 0.9,
    percentage_not_missing: float = 0,
    seed: int = 22,
) -> Tuple[pd.DataFrame, StructureModel, Dict, np.ndarray]:
    """
    Sample categorical data from a naive-Bayes structure whose latent
    variable ``z`` additionally has parents.

    p0 ... pn
     \\  |  /
        z
     /  |  \\
    c0 ... cm

    z = mode of parents with probability p_z, otherwise mode of parents + 1 mod n_categories
    c0 = z with prob. p_c, otherwise it is z + 1 mod n_categories
    if no p are given, sample z from the categories uniformly

    Note that this function reseeds NumPy's global random generator with ``seed``.

    Args:
        categories: number of categories
        samples: number of samples
        parents: number of parents, n as shown above
        children: number of children, m as above
        p_z: probability that z = mode(parents)
        p_c: probability that children equals parent
        percentage_not_missing: fraction (0 to 1) of the LV that is provided.
            The first ``int(samples * percentage_not_missing)`` rows keep their
            value of z, the remaining rows are set to NaN.
            The default is 0, i.e. the LV is not observed
        seed: seed for random generator

    Returns:
        data: sampled pandas dataframe, missing data on z (the ``z`` column is float so it can hold NaN)
        sm: structure model
        node_states: dictionary of list of states for each node
        true_lv_values: true values of latent variable, array of shape (samples, 1)
    """

    def mode(lst: Iterable) -> Any:
        # Counter.most_common orders equal counts by first occurrence,
        # so ties resolve to the value seen first in the row.
        return Counter(lst).most_common()[0][0] if len(lst) > 0 else np.nan

    np.random.seed(seed)

    # The order of the random draws below is kept fixed so that a given seed
    # always reproduces the same data set.
    par_samples = np.random.choice(categories, size=[samples, parents])

    if parents == 0:
        true_lv_values = np.random.choice(categories, size=[samples, 1])
    else:
        # With probability 1 - p_z the latent value is shifted by one category.
        true_lv_values = np.array(
            [
                [(mode(el) + np.random.choice(2, p=[p_z, 1 - p_z])) % categories]
                for el in par_samples
            ]
        )

    # Each child copies z when its uniform draw falls below p_c,
    # otherwise it takes the next category (wrapping around).
    child_samples = np.random.random(size=[samples, children])
    aux = true_lv_values.repeat(children, axis=1)
    child_samples = np.where(child_samples < p_c, aux, (aux + 1) % categories)

    df = pd.concat(
        [
            pd.DataFrame(par_samples, columns=[f"p_{i}" for i in range(parents)]),
            pd.DataFrame(child_samples, columns=[f"c_{i}" for i in range(children)]),
            pd.DataFrame(true_lv_values, columns=["z"]),
        ],
        axis=1,
    )

    # Cast explicitly before masking: writing NaN into an integer column
    # depends on silent upcasting, which pandas has deprecated.
    df["z"] = df["z"].astype(float)
    df.loc[int(samples * percentage_not_missing) :, "z"] = np.nan

    sm = StructureModel()
    sm.add_edges_from([(f"p_{i}", "z") for i in range(parents)])
    sm.add_edges_from([("z", f"c_{i}") for i in range(children)])

    # Every node gets its own list of states, so callers can mutate one
    # entry without affecting the others.
    node_states = {"z": list(range(categories))}
    for i in range(parents):
        node_states[f"p_{i}"] = list(range(categories))
    for i in range(children):
        node_states[f"c_{i}"] = list(range(categories))

    return df, sm, node_states, true_lv_values
def test_graph_not_a_dag(self):
    """Expect ``sem_generator`` to reject a cyclic graph with a ValueError."""
    # 0 -> 1 -> 2 -> 0 forms a directed cycle, so the graph is not a DAG.
    cyclic_edges = [(0, 1), (1, 2), (2, 0)]
    cyclic_graph = StructureModel()
    cyclic_graph.add_edges_from(cyclic_edges)

    with pytest.raises(ValueError, match="Provided graph is not a DAG"):
        sem_generator(graph=cyclic_graph, seed=42)