def test_missing_cardinality(self, schema):
        schema = {"new": "categorical:"}
        with pytest.raises(ValueError, match="Missing cardinality for categorical"):
            validate_schema(nodes={"new"}, schema=schema)

        schema = {"new": "categorical:01"}
        with pytest.raises(ValueError, match="Missing cardinality for categorical"):
            validate_schema(nodes={"new"}, schema=schema)

        schema = {"new": "categorical:100"}
        validate_schema(nodes={"new"}, schema=schema)
Beispiel #2
0
def _init_sem_data_gen(
    graph: nx.DiGraph,
    schema: Dict,
    n_samples: int,
    default_type: str,
    distributions: Dict[str, str],
    seed: int,
):
    np.random.seed(seed)

    if not nx.algorithms.is_directed_acyclic_graph(graph):
        raise ValueError("Provided graph is not a DAG.")

    distributions = _set_default_distributions(distributions=distributions)
    validated_schema = validate_schema(nodes=graph.nodes(),
                                       schema=schema,
                                       default_type=default_type)
    var_fte_mapper = VariableFeatureMapper(validated_schema)

    # pre-allocate array
    n_columns = var_fte_mapper.n_features
    x_mat = np.empty([n_samples, n_columns])

    return distributions, var_fte_mapper, x_mat
Beispiel #3
0
 def test_correct_schema(self, schema):
     new_schema = validate_schema(nodes=list(schema.keys()), schema=schema)
     assert new_schema == schema
Beispiel #4
0
 def test_imputation(self):
     default_schema = "continuous"
     new_schema = validate_schema(nodes=["new"],
                                  schema=None,
                                  default_type=default_schema)
     assert new_schema["new"] == default_schema
Beispiel #5
0
 def test_unknown_default_schema(self):
     with pytest.raises(ValueError, match="Unknown default data type"):
         validate_schema(nodes=["new"], schema={}, default_type="unknown")
Beispiel #6
0
 def test_unknown_data_type(self, schema):
     schema = {"new": "unknown"}
     with pytest.raises(ValueError, match="Unknown data type"):
         validate_schema(nodes={"new"}, schema=schema)
Beispiel #7
0
def sem_generator(
    graph: nx.DiGraph,
    schema: Optional[Dict] = None,
    default_type: str = "continuous",
    noise_std: float = 1.0,
    n_samples: int = 1000,
    distributions: Dict[str, str] = None,
    intercept: bool = True,
    seed: int = None,
) -> pd.DataFrame:
    """
    Generator for tabular data with mixed variable types from a DAG.

    Supported variable types: `'binary', 'categorical', 'continuous'`. The number
    of categories can be determined using a colon, e.g. `'categorical:5'`
    specifies a categorical feature with 5 categories.

    Notation: For binary and continuous variables, a ``variable'' refers to a
    ``node'', a ``feature'' refers to the one-hot column for categorical
    variables and is equivalent to a binary or continuous variable.

    Args:
        graph: A DAG in form of a networkx or StructureModel.
        schema: Dictionary with schema for a node/variable, if a node is missing
            uses ``default_type``. Format, {node_name: variable type}.
        default_type: The default data type for a node/variable not listed
            in the schema, or when the schema is empty.
        noise_std: The standard deviation of the noise. The binary and
            categorical features are created using a latent variable approach.
            The noise standard deviation determines how much weight the "mean"
            estimate has on the feature value.
        n_samples: The number of rows/observations to sample.
        distributions:
            ``continuous'': The type of distribution to use for the noise
                of a continuous variable. Options: 'gaussian'/'normal' (alias)
                (default), 'student-t', 'exponential', 'gumbel'.
            ``binary'': The type of distribution to use for the noise
                of the latent binary variable. Options: 'probit'/'normal' (alias),
                'logit' (default).
            ``categorical'': The type of distribution to use for the noise
                of a latent continuous feature. Options: 'probit'/'normal' (alias),
                'logit'/'gumbel' (alias) (default).
            ``weight'': The type of distribution to use for the linear coefficients.
                Options: 'gaussian'/'normal' (alias), 'uniform' (default).
            ``intercept'': The type of distribution to use for the intercept. For
                binary/categorical: this is the mean in the latent space.
                Options: 'gaussian'/'normal' (alias), 'uniform' (default).
        intercept: Whether to use an intercept for each feature. The intercept
            is sampled once and held constant for all rows. For binary or
            categorical the intercept determines the class imbalance.
        seed: Random State

    Returns:
        DataFrame with generated features, uses a one-hot coding for
        categorical features.

    Raises:
        ValueError: if the graph is not a DAG.
        ValueError: if schema variable type is not in `'binary', 'categorical',
            'continuous', 'continuous:X` (for variables with X categories).
        ValueError: if distributions['continuous'] is not 'gaussian', 'normal', 'student-t',
            'exponential', 'gumbel'.
        ValueError: if distributions['binary'] is not 'probit', 'normal', 'logit'.
        ValueError: if distributions['categorical'] is not 'probit', 'normal', 'logit', 'gumbel'.
        ValueError: if distributions['weight'] is not 'normal' / 'gaussian' (alias), 'uniform'.
        ValueError: if distributions['intercept'] is not 'normal' / 'gaussian' (alias), 'uniform'.


    Example:
        sm = StructureModel()

        sm.add_edges_from([('A', 'C'), ('D', 'C'), ('E', 'D')])

        sm.add_nodes_from(['B', 'F'])

        schema = {'B': 'binary', 'C': 'categorical:5',
                  'E': 'binary', 'F': 'continuous'}

        df = sem_generator(sm, schema, noise_scale=1,
                          n_samples=10000,
                          intercept=True,
                          )
    """

    np.random.seed(seed)

    if not nx.algorithms.is_directed_acyclic_graph(graph):
        raise ValueError("Provided graph is not a DAG.")

    distributions = _set_default_distributions(distributions=distributions)
    validated_schema = validate_schema(
        nodes=graph.nodes(), schema=schema, default_type=default_type
    )
    var_fte_mapper = VariableFeatureMapper(validated_schema)

    n_columns = var_fte_mapper.n_features

    # get dependence based on edges in graph (not via adjacency matrix)
    w_mat = _create_weight_matrix(
        edges_w_weights=graph.edges(data="weight"),
        variable_to_indices_dict=var_fte_mapper.var_indices_dict,
        weight_distribution=distributions["weight"],
        intercept_distribution=distributions["intercept"],
        intercept=intercept,
    )

    # pre-allocate array
    x_mat = np.empty([n_samples, n_columns + 1 if intercept else n_columns])
    # intercept, append ones to the feature matrix
    if intercept:
        x_mat[:, -1] = 1

    # loop over sorted features according to ancestry (no parents first)
    for j_node in nx.topological_sort(graph):
        # all feature indices corresponding to the node/variable
        j_idx_list = var_fte_mapper.get_indices(j_node)

        # get all parent feature indices for the variable/node
        parents_idx = var_fte_mapper.get_indices(list(graph.predecessors(j_node)))
        if intercept:
            parents_idx += [n_columns]

        # continuous variable
        if var_fte_mapper.is_var_of_type(j_node, "continuous"):
            x_mat[:, j_idx_list[0]] = _add_continuous_noise(
                mean=x_mat[:, parents_idx].dot(w_mat[parents_idx, j_idx_list[0]]),
                distribution=distributions["continuous"],
                noise_std=noise_std,
            )

        # binary variable
        elif var_fte_mapper.is_var_of_type(j_node, "binary"):
            x_mat[:, j_idx_list[0]] = _sample_binary_from_latent(
                latent_mean=x_mat[:, parents_idx].dot(
                    w_mat[parents_idx, j_idx_list[0]]
                ),
                distribution=distributions["binary"],
                noise_std=noise_std,
            )

        # categorical variable
        elif var_fte_mapper.is_var_of_type(j_node, "categorical"):
            x_mat[:, j_idx_list] = _sample_categories_from_latent(
                latent_mean=x_mat[:, parents_idx].dot(
                    w_mat[np.ix_(parents_idx, j_idx_list)]
                ),
                distribution=distributions["categorical"],
                noise_std=noise_std,
            )

    return pd.DataFrame(
        x_mat[:, :-1] if intercept else x_mat, columns=var_fte_mapper.feature_list
    )