Esempio n. 1
0
def naive_bayes_plus_parents(
    categories: int = 3,
    samples: int = 500,
    parents: int = 3,
    children: int = 3,
    p_z: float = 0.9,
    p_c: float = 0.9,
    percentage_not_missing: float = 0,
    seed: int = 22,
) -> Tuple[pd.DataFrame, StructureModel, Dict, np.ndarray]:
    """
    Sample categorical data from a network with one (partially) latent node z.

    p0 ... pn
     \\  |  /
        z
     /  |  \\
    c0 ... cm

    z = mode of parents with probability p_z, otherwise mode of parents + 1 mod n_categories
    c0 = z with prob. p_c, otherwise it is z + 1 mod n_categories
    if no parents are given, z is sampled uniformly from the categories

    Note: this function reseeds NumPy's global random generator with ``seed``.

    Args:
        categories: number of categories
        samples: number of samples
        parents: number of parents, n as shown above
        children: number of children, m as above
        p_z: probability that z = mode(parents)
        p_c: probability that a child equals z
        percentage_not_missing: fraction (0 to 1) of the LV that is provided; the first
            ``int(samples * percentage_not_missing)`` rows keep their z value and the
            remaining rows are set to NaN. The default is 0, i.e. the LV is not observed
        seed: seed for random generator

    Returns:
        data: sampled pandas dataframe with columns p_i, c_i and z, missing data on z
        sm: structure model with edges p_i -> z and z -> c_i
        node_states: dictionary of list of states for each node
        true_lv_values: true values of latent variable, one row per sample
    """

    def mode(lst: Iterable) -> Any:
        # Most frequent element; ties go to whichever most_common() lists first.
        return Counter(lst).most_common()[0][0] if len(lst) > 0 else np.nan

    # The order of the random draws below is kept fixed so that a given seed
    # always reproduces the same dataset.
    np.random.seed(seed)
    par_samples = np.random.choice(categories, size=[samples, parents])

    if parents == 0:
        true_lv_values = np.random.choice(categories, size=[samples, 1])
    else:
        # With probability 1 - p_z the latent value is shifted by one category.
        true_lv_values = np.array(
            [
                [(mode(el) + np.random.choice(2, p=[p_z, 1 - p_z])) % categories]
                for el in par_samples
            ]
        )

    # Each child copies z when its uniform draw is below p_c, else takes z + 1.
    child_samples = np.random.random(size=[samples, children])
    aux = true_lv_values.repeat(children, axis=1)
    child_samples = np.where(child_samples < p_c, aux, (aux + 1) % categories)

    df = pd.concat(
        [
            pd.DataFrame(par_samples, columns=[f"p_{i}" for i in range(parents)]),
            pd.DataFrame(child_samples, columns=[f"c_{i}" for i in range(children)]),
            pd.DataFrame(true_lv_values, columns=["z"]),
        ],
        axis=1,
    )

    first_missing = int(samples * percentage_not_missing)
    if first_missing < samples:
        # z is an integer column; cast explicitly before writing NaN instead of
        # relying on pandas' deprecated silent int -> float upcasting.
        df["z"] = df["z"].astype(float)
        df.loc[first_missing:, "z"] = np.nan

    sm = StructureModel()
    sm.add_edges_from([(f"p_{i}", "z") for i in range(parents)])
    sm.add_edges_from([("z", f"c_{i}") for i in range(children)])

    node_states = {"z": list(range(categories))}

    for i in range(parents):
        node_states[f"p_{i}"] = list(range(categories))
    for i in range(children):
        node_states[f"c_{i}"] = list(range(categories))

    return df, sm, node_states, true_lv_values
Esempio n. 2
0
    def test_graph_not_a_dag(self):
        """sem_generator is expected to raise ValueError for a cyclic graph."""
        # 0 -> 1 -> 2 -> 0 closes a cycle, so the structure is not a DAG.
        cycle_edges = [(0, 1), (1, 2), (2, 0)]
        cyclic_sm = StructureModel()
        cyclic_sm.add_edges_from(cycle_edges)

        with pytest.raises(ValueError, match="Provided graph is not a DAG"):
            sem_generator(graph=cyclic_sm, seed=42)