Example #1
    def test_run(self, graph, schema):
        df = sem_generator(
            graph=graph,
            schema=schema,
            default_type="continuous",
            noise_std=1.0,
            n_samples=1000,
            intercept=False,
            seed=12,
        )

        # test binary:
        assert df[0].nunique() == 2
        assert df[2].nunique() == 2

        # test categorical:
        for col in ["1_{}".format(i) for i in range(3)]:
            assert df[col].nunique() == 2
        assert len([x for x in df.columns
                    if isinstance(x, str) and "1_" in x]) == 3

        for col in ["5_{}".format(i) for i in range(5)]:
            assert df[col].nunique() == 2
        assert len([x for x in df.columns
                    if isinstance(x, str) and "5_" in x]) == 5

        # test continuous
        assert df[3].nunique() == 1000
        assert df[4].nunique() == 1000
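
The `graph` and `schema` fixtures are not shown in this snippet. A minimal sketch that is consistent with the assertions above (the node names, edges, types, and cardinalities are inferred for illustration, not taken from the original suite):

import pytest
from causalnex.structure import StructureModel

@pytest.fixture
def graph():
    # Six-node DAG; the exact edges are an assumption for illustration.
    sm = StructureModel()
    sm.add_weighted_edges_from([(0, 1, 1.0), (2, 4, 0.8), (3, 5, 0.5)])
    return sm

@pytest.fixture
def schema():
    # Types inferred from the assertions: 0 and 2 are binary, 1 and 5 are
    # categorical (3 and 5 categories), and 3 and 4 are continuous.
    return {
        0: "binary",
        1: "categorical:3",
        2: "binary",
        3: "continuous",
        4: "continuous",
        5: "categorical:5",
    }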
Example #2
def generate_categorical_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "logit",
    n_categories: int = 3,
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: Optional[int] = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe with samples from an SEM with the specified type of noise.

    Args:
        sm: A DAG in the form of a networkx DiGraph or a StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'probit'/'normal' (alias),
            "logit"/"gumbel" (alias). Logit is default.
        n_categories: Number of categories per variable/node.
        noise_scale: The standard deviation of the noise. The categorical features
            are created using a latent variable approach. The noise standard
            deviation determines how much weight the "mean" estimate has on
            the feature value.
        intercept: Whether to use an intercept for the latent variable of each feature.
        seed: Random state
    Returns:
        x_mat: [n_samples, d_nodes] sample matrix
    Raises:
        ValueError: if distribution is not 'probit', 'normal', 'logit', 'gumbel'
    """

    if kernel is None:
        return sem_generator(
            graph=sm,
            default_type=f"categorical:{n_categories}",
            n_samples=n_samples,
            distributions={"categorical": distribution},
            noise_std=noise_scale,
            intercept=intercept,
            seed=seed,
        )

    return nonlinear_sem_generator(
        graph=sm,
        kernel=kernel,
        default_type=f"categorical:{n_categories}",
        n_samples=n_samples,
        distributions={"categorical": distribution},
        noise_std=noise_scale,
        seed=seed,
    )
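
A minimal usage sketch for this function; the import path (`causalnex.structure.data_generators`) and the toy edge are assumptions for illustration, while the `{node}_{category}` column naming follows the tests above:

from causalnex.structure import StructureModel
from causalnex.structure.data_generators import generate_categorical_dataframe

sm = StructureModel()
sm.add_weighted_edges_from([("a", "b", 1.0)])

df = generate_categorical_dataframe(sm, n_samples=500, n_categories=3, seed=7)
# Each node expands into one-hot columns such as "a_0", "a_1", "a_2".
print(sorted(df.columns))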
Example #3
def generate_binary_data(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "logit",
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: Optional[int] = None,
    kernel: Optional[Kernel] = None,
) -> np.ndarray:
    """
    Simulate samples from an SEM with the specified type of noise.
    The columns of the returned array follow the order of `sm.nodes`.

    Args:
        sm: A DAG in the form of a networkx DiGraph or a StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'probit'/'normal' (alias),
            'logit' (default).
        noise_scale: The standard deviation of the noise. The binary and
            categorical features are created using a latent variable approach.
            The noise standard deviation determines how much weight the "mean"
            estimate has on the feature value.
        intercept: Whether to use an intercept for the latent variable of each feature.
        seed: Random state
    Returns:
        x_mat: [n_samples, d_nodes] sample matrix
    Raises:
        ValueError: if distribution is not 'probit', 'normal', 'logit'
    """
    if kernel is None:
        df = sem_generator(
            graph=sm,
            default_type="binary",
            n_samples=n_samples,
            distributions={"binary": distribution},
            noise_std=noise_scale,
            intercept=intercept,
            seed=seed,
        )
    else:
        df = nonlinear_sem_generator(
            graph=sm,
            kernel=kernel,
            default_type="binary",
            n_samples=n_samples,
            distributions={"binary": distribution},
            noise_std=noise_scale,
            seed=seed,
        )
    return df[list(sm.nodes())].values
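
A minimal usage sketch; the import path and toy edge are assumptions for illustration. Note the ndarray return, with columns ordered by `sm.nodes`:

from causalnex.structure import StructureModel
from causalnex.structure.data_generators import generate_binary_data

sm = StructureModel()
sm.add_weighted_edges_from([("x", "y", 2.0)])

x_mat = generate_binary_data(sm, n_samples=1000, distribution="probit", seed=3)
# One column per node, in the order given by sm.nodes.
assert x_mat.shape == (1000, 2)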
Example #4
    def test_missing_default_type(self, graph):
        with pytest.raises(ValueError, match="Unknown default data type"):
            _ = sem_generator(
                graph=graph,
                schema=None,  # the schema is not under test here
                default_type="unknown",
                noise_std=1.0,
                n_samples=1000,
                intercept=False,
                seed=12,
            )
Example #5
    def test_incorrect_intercept_dist(self, graph):
        with pytest.raises(ValueError, match="Unknown intercept distribution"):
            _ = sem_generator(
                graph=graph,
                schema=None,
                default_type="continuous",
                distributions={"intercept": "unknown"},
                noise_std=2.0,
                n_samples=10,
                intercept=True,
                seed=10,
            )
Example #6
def generate_continuous_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "gaussian",
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: Optional[int] = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe with samples from an SEM with the specified type of noise.
    Args:
        sm: A DAG in the form of a networkx DiGraph or a StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise
            of a variable. Options: 'gaussian'/'normal' (alias), 'student-t',
            'exponential', 'gumbel'.
        noise_scale: The standard deviation of the noise.
        intercept: Whether to use an intercept for each feature.
        seed: Random state
    Returns:
        Dataframe with the node names as column names
    Raises:
        ValueError: if distribution is not 'gaussian', 'normal', 'student-t',
            'exponential', 'gumbel'
    """
    if kernel is None:
        return sem_generator(
            graph=sm,
            default_type="continuous",
            n_samples=n_samples,
            distributions={"continuous": distribution},
            noise_std=noise_scale,
            intercept=intercept,
            seed=seed,
        )

    return nonlinear_sem_generator(
        graph=sm,
        kernel=kernel,
        default_type="continuous",
        n_samples=n_samples,
        distributions={"continuous": distribution},
        noise_std=noise_scale,
        seed=seed,
    )
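
A minimal usage sketch covering both the linear and the kernel (nonlinear) paths; the import path and toy edges are assumptions, while RBF comes from sklearn.gaussian_process.kernels as the docstring notes:

from sklearn.gaussian_process.kernels import RBF

from causalnex.structure import StructureModel
from causalnex.structure.data_generators import generate_continuous_dataframe

sm = StructureModel()
sm.add_weighted_edges_from([("a", "b", 1.5), ("b", "c", -0.7)])

# Linear SEM with Student-t noise.
df_linear = generate_continuous_dataframe(
    sm, n_samples=1000, distribution="student-t", seed=0
)

# Nonlinear SEM: parent effects are passed through a Gaussian-process kernel.
df_nonlinear = generate_continuous_dataframe(
    sm, n_samples=1000, kernel=RBF(1.0), seed=0
)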
Example #7
    def test_not_permissible_type(self, graph):
        schema = {
            0: "unknown data type",
        }
        with pytest.raises(ValueError, match="Unknown data type"):
            _ = sem_generator(
                graph=graph,
                schema=schema,
                default_type="continuous",
                noise_std=1.0,
                n_samples=1000,
                intercept=False,
                seed=12,
            )
Example #8
def generate_count_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    zero_inflation_factor: float = 0.1,
    intercept: bool = False,
    seed: Optional[int] = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Generates a dataframe with samples from an SEM with the specified type of noise.

    Args:
        sm: A DAG in the form of a networkx DiGraph or a StructureModel. Does not require weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        zero_inflation_factor: The probability of zero inflation for count data.
        intercept: Whether to use an intercept for the latent variable of each feature.
        seed: Random state
    Returns:
        x_mat: [n_samples, d_nodes] sample matrix
    Raises:
        ValueError: if ``zero_inflation_factor`` is not a float in [0, 1].
    """

    if kernel is None:
        return sem_generator(
            graph=sm,
            default_type="count",
            n_samples=n_samples,
            distributions={"count": zero_inflation_factor},
            noise_std=1,  # not used for poisson
            intercept=intercept,
            seed=seed,
        )

    return nonlinear_sem_generator(
        graph=sm,
        kernel=kernel,
        default_type="count",
        n_samples=n_samples,
        distributions={"count": zero_inflation_factor},
        noise_std=1,  # not used for poisson
        seed=seed,
    )
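
A minimal usage sketch; the import path and toy edge are assumptions for illustration:

from causalnex.structure import StructureModel
from causalnex.structure.data_generators import generate_count_dataframe

sm = StructureModel()
sm.add_weighted_edges_from([("visits", "purchases", 0.9)])

df = generate_count_dataframe(sm, n_samples=2000, zero_inflation_factor=0.2, seed=1)
# Zero-inflated Poisson counts: about 20% of entries are zeroed by inflation,
# on top of any natural Poisson zeros.
print((df == 0).mean())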
Example #9
    def test_missing_cardinality(self, graph):
        schema = {
            0: "categorical",
            1: "categorical:3",
            5: "categorical:5",
        }
        with pytest.raises(ValueError,
                           match="Missing cardinality for categorical"):
            _ = sem_generator(
                graph=graph,
                schema=schema,
                default_type="continuous",
                noise_std=1.0,
                n_samples=1000,
                intercept=False,
                seed=12,
            )
Example #10
    def test_incorrect_weight_dist(self):
        sm = StructureModel()
        nodes = list(str(x) for x in range(6))
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)

        sm.add_weighted_edges_from([("0", "1", None), ("2", "4", None)])

        with pytest.raises(ValueError, match="Unknown weight distribution"):
            _ = sem_generator(
                graph=sm,
                schema=None,
                default_type="continuous",
                distributions={"weight": "unknown"},
                noise_std=2.0,
                n_samples=1000,
                intercept=False,
                seed=10,
            )
Example #11
    def test_mixed_type_independence(self, seed, n_categories,
                                     weight_distribution,
                                     intercept_distribution):
        """
        Test whether the relation is accurate, implicitly tests sequence of
        nodes.
        """
        np.random.seed(seed)

        sm = StructureModel()
        nodes = list(str(x) for x in range(6))
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)
        # binary -> categorical
        sm.add_weighted_edges_from([("0", "1", 10)])
        # binary -> continuous
        sm.add_weighted_edges_from([("2", "4", None)])

        schema = {
            "0": "binary",
            "1": "categorical:{}".format(n_categories),
            "2": "binary",
            "4": "continuous",
            "5": "categorical:{}".format(n_categories),
        }

        df = sem_generator(
            graph=sm,
            schema=schema,
            default_type="continuous",
            distributions={
                "weight": weight_distribution,
                "intercept": intercept_distribution,
            },
            noise_std=2,
            n_samples=100000,
            intercept=True,
            seed=seed,
        )

        atol = 0.05  # 5% difference between joint & factored
        # 1. dependent links
        # 0 -> 1 (we look at the class with the highest deviation from uniform
        # to avoid small values)
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "0",
                                                      "1_{}".format(c))
        assert not np.isclose(joint_proba, factored_proba, rtol=0, atol=atol)
        # 2 -> 4
        assert not np.isclose(
            df["4"].mean(), df["4"][df["2"] == 1].mean(), rtol=0, atol=atol)

        tol = 0.15  # relative tolerance of +/- 15% on the compared value
        # 2. independent links
        # categorical
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "0",
                                                      "5_{}".format(c))
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # binary
        joint_proba, factored_proba = calculate_proba(df, "0", "2")
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # categorical
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        d, _ = max(
            [(d, np.abs(df["5_{}".format(d)].mean() - 1 / n_categories))
             for d in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "1_{}".format(d),
                                                      "5_{}".format(c))
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # continuous
        # for jointly gaussian variables, zero correlation is equivalent to independence
        assert np.isclose(df[["3", "4"]].corr().values[0, 1], 0, atol=tol)
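
The test above depends on a calculate_proba helper that is not shown. A minimal sketch consistent with how it is called (this reconstruction is an assumption, not the original helper): for two 0/1 indicator columns it returns the empirical joint probability P(a=1, b=1) together with the factored product P(a=1) * P(b=1), which should agree under independence.

def calculate_proba(df, col_a, col_b):
    # Empirical marginals of two 0/1 indicator columns.
    p_a = df[col_a].mean()
    p_b = df[col_b].mean()
    # Empirical joint probability that both indicators are 1.
    joint_proba = (df[col_a] * df[col_b]).mean()
    return joint_proba, p_a * p_b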
Example #12
    def test_graph_not_a_dag(self):
        graph = StructureModel()
        graph.add_edges_from([(0, 1), (1, 2), (2, 0)])

        with pytest.raises(ValueError, match="Provided graph is not a DAG"):
            _ = sem_generator(graph=graph)