def test_run(self, graph, schema):
    """
    Generate 1000 samples from the fixture graph/schema and check the number
    of distinct values per generated column.

    The checks below expect column ``0`` to hold two distinct values, nodes
    ``1`` and ``5`` to be expanded into 3 and 5 two-valued columns named
    ``"<node>_<i>"``, and columns ``3`` and ``4`` to hold 1000 distinct values.
    """
    df = sem_generator(
        graph=graph,
        schema=schema,
        default_type="continuous",
        noise_std=1.0,
        n_samples=1000,
        intercept=False,
        seed=12,
    )

    # test binary:
    # NOTE(review): this assertion used to be written twice verbatim; the copy
    # was presumably meant to cover a second binary node -- confirm against the
    # ``schema`` fixture and add that check if so.
    assert df[0].nunique() == 2

    # test categorical: one two-valued column per category, named "<node>_<i>"
    for node, cardinality in ((1, 3), (5, 5)):
        prefix = "{}_".format(node)
        for i in range(cardinality):
            assert df["{}{}".format(prefix, i)].nunique() == 2
        expanded = [x for x in df.columns if isinstance(x, str) and prefix in x]
        assert len(expanded) == cardinality

    # test continuous: with 1000 samples every value is expected to be unique
    assert df[3].nunique() == 1000
    assert df[4].nunique() == 1000
def generate_categorical_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "logit",
    n_categories: int = 3,
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: int = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Sample a dataframe of categorical variables from a SEM defined on ``sm``.

    Every node is generated with the type ``categorical:<n_categories>``.
    When ``kernel`` is None the work is delegated to ``sem_generator``;
    otherwise it is delegated to ``nonlinear_sem_generator``.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require
            weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise of a
            variable. Options: 'probit'/'normal' (alias),
            "logit"/"gumbel" (alias). Logit is default.
        n_categories: Number of categories per variable/node.
        noise_scale: The standard deviation of the noise. The categorical
            features are created using a latent variable approach. The noise
            standard deviation determines how much weight the "mean" estimate
            has on the feature value.
        intercept: Whether to use an intercept for the latent variable of each
            feature. Only forwarded when ``kernel`` is None.
        seed: Random state

    Returns:
        The dataframe produced by the selected generator.

    Raises:
        ValueError: if distribution is not 'probit', 'normal', 'logit',
            'gumbel'
    """
    # Arguments shared by the linear and the kernel-based generator.
    shared_kwargs = {
        "graph": sm,
        "default_type": f"categorical:{n_categories}",
        "n_samples": n_samples,
        "distributions": {"categorical": distribution},
        "noise_std": noise_scale,
        "seed": seed,
    }

    if kernel is not None:
        return nonlinear_sem_generator(kernel=kernel, **shared_kwargs)
    return sem_generator(intercept=intercept, **shared_kwargs)
def generate_binary_data(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "logit",
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: int = None,
    kernel: Optional[Kernel] = None,
) -> np.ndarray:
    """
    Sample an array of binary variables from a SEM defined on ``sm``.

    Every node is generated with the type ``binary``. The columns of the
    returned array follow the order of ``sm.nodes``. When ``kernel`` is None
    the work is delegated to ``sem_generator``; otherwise it is delegated to
    ``nonlinear_sem_generator``.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require
            weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise of a
            variable. Options: 'probit'/'normal' (alias), 'logit' (default).
        noise_scale: The standard deviation of the noise. The binary and
            categorical features are created using a latent variable approach.
            The noise standard deviation determines how much weight the "mean"
            estimate has on the feature value.
        intercept: Whether to use an intercept for the latent variable of each
            feature. Only forwarded when ``kernel`` is None.
        seed: Random state

    Returns:
        x_mat: [n_samples,d_nodes] sample matrix

    Raises:
        ValueError: if distribution isn't 'probit', 'normal', 'logit'
    """
    # Arguments shared by the linear and the kernel-based generator.
    shared_kwargs = {
        "graph": sm,
        "default_type": "binary",
        "n_samples": n_samples,
        "distributions": {"binary": distribution},
        "noise_std": noise_scale,
        "seed": seed,
    }

    if kernel is not None:
        samples = nonlinear_sem_generator(kernel=kernel, **shared_kwargs)
    else:
        samples = sem_generator(intercept=intercept, **shared_kwargs)

    # Select the columns in graph-node order before dropping to a plain array.
    node_order = list(sm.nodes())
    return samples[node_order].values
def test_missing_default_type(self, graph):
    """An unrecognised ``default_type`` must raise a ``ValueError``."""
    with pytest.raises(ValueError, match="Unknown default data type"):
        _ = sem_generator(
            graph=graph,
            # No schema is needed here: only the default type is under test.
            # (Previously ``schema`` was referenced without being a parameter
            # or a local of this test.)
            schema=None,
            default_type="unknown",
            noise_std=1.0,
            n_samples=1000,
            intercept=False,
            seed=12,
        )
def test_incorrect_intercept_dist(self, graph):
    """An unrecognised intercept distribution name must raise a ``ValueError``."""
    generator_kwargs = {
        "graph": graph,
        "schema": None,
        "default_type": "continuous",
        "distributions": {"intercept": "unknown"},
        "noise_std": 2.0,
        "n_samples": 10,
        "intercept": True,
        "seed": 10,
    }
    with pytest.raises(ValueError, match="Unknown intercept distribution"):
        sem_generator(**generator_kwargs)
def generate_continuous_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    distribution: str = "gaussian",
    noise_scale: float = 1.0,
    intercept: bool = False,
    seed: int = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Sample a dataframe of continuous variables from a SEM defined on ``sm``.

    Every node is generated with the type ``continuous``. When ``kernel`` is
    None the work is delegated to ``sem_generator``; otherwise it is delegated
    to ``nonlinear_sem_generator``.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require
            weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        distribution: The type of distribution to use for the noise of a
            variable. Options: 'gaussian'/'normal' (alias), 'student-t',
            'exponential', 'gumbel'.
        noise_scale: The standard deviation of the noise.
        intercept: Whether to use an intercept for each feature. Only
            forwarded when ``kernel`` is None.
        seed: Random state

    Returns:
        Dataframe with the node names as column names

    Raises:
        ValueError: if distribution is not 'gaussian', 'normal', 'student-t',
            'exponential', 'gumbel'
    """
    # Arguments shared by the linear and the kernel-based generator.
    shared_kwargs = {
        "graph": sm,
        "default_type": "continuous",
        "n_samples": n_samples,
        "distributions": {"continuous": distribution},
        "noise_std": noise_scale,
        "seed": seed,
    }

    if kernel is not None:
        return nonlinear_sem_generator(kernel=kernel, **shared_kwargs)
    return sem_generator(intercept=intercept, **shared_kwargs)
def test_not_permissible_type(self, graph):
    """A schema entry with an unrecognised data type must raise a ``ValueError``."""
    bad_schema = {0: "unknown data type"}
    generator_kwargs = {
        "graph": graph,
        "schema": bad_schema,
        "default_type": "continuous",
        "noise_std": 1.0,
        "n_samples": 1000,
        "intercept": False,
        "seed": 12,
    }
    with pytest.raises(ValueError, match="Unknown data type"):
        sem_generator(**generator_kwargs)
def generate_count_dataframe(
    sm: nx.DiGraph,
    n_samples: int,
    zero_inflation_factor: float = 0.1,
    intercept: bool = False,
    seed: int = None,
    kernel: Optional[Kernel] = None,
) -> pd.DataFrame:
    """
    Sample a dataframe of count variables from a SEM defined on ``sm``.

    Every node is generated with the type ``count``. When ``kernel`` is None
    the work is delegated to ``sem_generator``; otherwise it is delegated to
    ``nonlinear_sem_generator``.

    Args:
        sm: A DAG in form of a networkx or StructureModel. Does not require
            weights.
        n_samples: The number of rows/observations to sample.
        kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or
            Matern(1) or any combinations thereof. The kernels are used to
            create the latent variable for the binary / categorical variables
            and are directly used for continuous variables.
        zero_inflation_factor: The probability of zero inflation for count
            data.
        intercept: Whether to use an intercept for the latent variable of each
            feature. Only forwarded when ``kernel`` is None.
        seed: Random state

    Returns:
        The dataframe produced by the selected generator.

    Raises:
        ValueError: if ``zero_inflation_factor`` is not a float in [0, 1].
    """
    # Arguments shared by the linear and the kernel-based generator.
    shared_kwargs = {
        "graph": sm,
        "default_type": "count",
        "n_samples": n_samples,
        "distributions": {"count": zero_inflation_factor},
        "noise_std": 1,  # not used for poisson
        "seed": seed,
    }

    if kernel is not None:
        return nonlinear_sem_generator(kernel=kernel, **shared_kwargs)
    return sem_generator(intercept=intercept, **shared_kwargs)
def test_missing_cardinality(self, graph):
    """A ``categorical`` schema entry without a cardinality must raise a ``ValueError``."""
    # Node 0 lacks the ":<n>" suffix that the other categorical entries carry.
    incomplete_schema = {
        0: "categorical",
        1: "categorical:3",
        5: "categorical:5",
    }
    generator_kwargs = {
        "graph": graph,
        "schema": incomplete_schema,
        "default_type": "continuous",
        "noise_std": 1.0,
        "n_samples": 1000,
        "intercept": False,
        "seed": 12,
    }
    with pytest.raises(ValueError, match="Missing cardinality for categorical"):
        sem_generator(**generator_kwargs)
def test_incorrect_weight_dist(self):
    """An unrecognised weight distribution name must raise a ``ValueError``."""
    # Six string-named nodes, inserted in shuffled order.
    node_names = [str(idx) for idx in range(6)]
    np.random.shuffle(node_names)

    model = StructureModel()
    model.add_nodes_from(node_names)
    # Edges carry no weight (None).
    unweighted_edges = [("0", "1", None), ("2", "4", None)]
    model.add_weighted_edges_from(unweighted_edges)

    generator_kwargs = {
        "graph": model,
        "schema": None,
        "default_type": "continuous",
        "distributions": {"weight": "unknown"},
        "noise_std": 2.0,
        "n_samples": 1000,
        "intercept": False,
        "seed": 10,
    }
    with pytest.raises(ValueError, match="Unknown weight distribution"):
        sem_generator(**generator_kwargs)
def test_mixed_type_independence(
    self, seed, n_categories, weight_distribution, intercept_distribution
):
    """
    Test whether the relation is accurate, implicitly tests sequence of nodes.

    The graph has two edges, "0" -> "1" (binary -> categorical) and
    "2" -> "4" (binary -> continuous). Linked variables must show a
    measurable dependence; unlinked variables must look independent, i.e.
    their joint probability must match the product of their marginals.
    """
    np.random.seed(seed)

    sm = StructureModel()
    nodes = [str(x) for x in range(6)]
    np.random.shuffle(nodes)
    sm.add_nodes_from(nodes)
    # binary -> categorical
    sm.add_weighted_edges_from([("0", "1", 10)])
    # binary -> continuous
    sm.add_weighted_edges_from([("2", "4", None)])

    # Node "3" is not listed and therefore falls back to ``default_type``.
    schema = {
        "0": "binary",
        "1": "categorical:{}".format(n_categories),
        "2": "binary",
        "4": "continuous",
        "5": "categorical:{}".format(n_categories),
    }

    df = sem_generator(
        graph=sm,
        schema=schema,
        default_type="continuous",
        distributions={
            "weight": weight_distribution,
            "intercept": intercept_distribution,
        },
        noise_std=2,
        n_samples=100000,
        intercept=True,
        seed=seed,
    )

    def most_skewed_class(node):
        """Return the class index of ``node`` whose frequency deviates most
        from the uniform ``1 / n_categories`` (first one on ties)."""
        return max(
            range(n_categories),
            key=lambda k: np.abs(
                df["{}_{}".format(node, k)].mean() - 1 / n_categories
            ),
        )

    # We look at the class with the highest deviation from uniform to avoid
    # small values. Each categorical node gets its own class index: `c` for
    # node "1" and `d` for node "5".
    c = most_skewed_class("1")
    d = most_skewed_class("5")

    atol = 0.05  # 5% difference between joint & factored!

    # 1. dependent links
    # 0 -> 1
    joint_proba, factored_proba = calculate_proba(df, "0", "1_{}".format(c))
    assert not np.isclose(joint_proba, factored_proba, rtol=0, atol=atol)
    # 2 -> 4
    assert not np.isclose(
        df["4"].mean(), df["4"][df["2"] == 1].mean(), rtol=0, atol=atol
    )

    # relative tolerance of +- 15% between joint and factored probability
    # (also used as the absolute tolerance of the correlation check below)
    tol = 0.15

    # 2. independent links
    # binary vs. categorical (previously indexed node "5" with the class
    # index that had been selected for node "1")
    joint_proba, factored_proba = calculate_proba(df, "0", "5_{}".format(d))
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # binary
    joint_proba, factored_proba = calculate_proba(df, "0", "2")
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # categorical vs. categorical (previously the two class indices were
    # swapped between the nodes)
    joint_proba, factored_proba = calculate_proba(
        df, "1_{}".format(c), "5_{}".format(d)
    )
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # continuous
    # for jointly gaussian variables, zero correlation is equivalent to
    # independence
    assert np.isclose(df[["3", "4"]].corr().values[0, 1], 0, atol=tol)
def test_graph_not_a_dag(self):
    """A graph containing a cycle must be rejected with a ``ValueError``."""
    # 0 -> 1 -> 2 -> 0 closes a cycle.
    cycle_edges = [(0, 1), (1, 2), (2, 0)]
    cyclic = StructureModel()
    cyclic.add_edges_from(cycle_edges)

    with pytest.raises(ValueError, match="Provided graph is not a DAG"):
        sem_generator(graph=cyclic)