Code Example #1
File: test_metrics.py Project: zeta1999/causalnex
    def test_auc_node_with_no_parents(self):
        """Should be possible to compute auc for state with no parent nodes"""

        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2) for _ in range(1)]
            + [[a, b, 1] for a in range(0, 2) for b in range(0, 2)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a in range(2, 4) for b in range(2, 4) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "a")
        assert math.isclose(auc, 0.5, abs_tol=0.01)
Code Example #2
File: test_metrics.py Project: zeta1999/causalnex
    def test_auc_of_accurate_predictions(self):
        """AUC of accurate predictions should be 1"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2) for _ in range(1)]
            + [[a, b, 1] for a in range(0, 2) for b in range(0, 2)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a in range(2, 4) for b in range(2, 4) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
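
Taken together, the assertions in these tests imply the return contract of roc_auc: a pair of the ROC curve points and the scalar AUC. A hedged usage sketch, inferred from the tests rather than from the library's documentation:

# Inferred from the tests: roc_auc(bn, data, node) returns (roc, auc),
# where roc iterates as (fpr, tpr) pairs and auc is a float in [0, 1].
roc, auc = roc_auc(bn, train, "c")
for fpr, tpr in roc:
    print(f"FPR={fpr:.3f}  TPR={tpr:.3f}")
print(f"AUC={auc:.3f}")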
Code Example #3
File: test_metrics.py Project: zeta1999/causalnex
    def test_auc_with_missing_state_in_test(self):
        """AUC should still be calculated correctly with states missing in test set"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2) for _ in range(1)]
            + [[a, b, 1] for a in range(0, 2) for b in range(0, 2)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a in range(2, 4) for b in range(2, 4) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        test = train[train["c"] == 1]
        assert len(test["c"].unique()) == 1

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, test, "c")
        assert math.isclose(auc, 1, abs_tol=0.01)
Code Example #4
File: test_metrics.py Project: zeta1999/causalnex
    def test_roc_of_accurate_predictions(self):
        """TPR should always be better than FPR for accurate predictions"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2) for _ in range(10)]
            + [[a, b, 1] for a in range(0, 2) for b in range(0, 2)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a in range(2, 4) for b in range(2, 4) for _ in range(10)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        roc, _ = roc_auc(bn, train, "c")
        assert all(tpr > fpr for fpr, tpr in roc if tpr not in [0.0, 1.0])
Code Example #5
File: test_metrics.py Project: zeta1999/causalnex
    def test_roc_of_random_has_unit_gradient(self):
        """The ROC curve for random predictions should be a line from (0,0) to (1,1)"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1)]
            + [[a, b, 1] for a in range(3) for b in range(3)
               for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        test = pd.DataFrame(
            [[a, b, random.randint(0, 1)] for a in range(3) for b in range(3)
             for _ in range(1000)],
            columns=["a", "b", "c"],
        )

        roc, _ = roc_auc(bn, test, "c")

        assert len(roc) > 3
        assert all(math.isclose(a, b, abs_tol=0.03) for a, b in roc)
Code Example #6
File: test_metrics.py Project: zeta1999/causalnex
    def test_auc_of_incorrect_close_to_zero(self):
        """The AUC of incorrect predictions should be close to zero"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1)]
            + [[a, b, 1] for a in range(3) for b in range(3)
               for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # in test, c=0 is always more likely (opposite of train)
        test = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1000)]
            + [[a, b, 1] for a in range(3) for b in range(3) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        _, auc = roc_auc(bn, test, "c")

        assert math.isclose(auc, 0, abs_tol=0.001)
Code Example #7
File: test_metrics.py Project: zeta1999/causalnex
    def test_roc_of_incorrect_has_fpr_lt_tpr(self):
        """The ROC of incorrect predictions should have FPR < TPR"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1)]
            + [[a, b, 1] for a in range(3) for b in range(3)
               for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # in test, c=0 is always more likely (opposite of train)
        test = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1000)]
            + [[a, b, 1] for a in range(3) for b in range(3) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        roc, _ = roc_auc(bn, test, "c")

        assert len(roc) > 3
        assert all(fpr > tpr for fpr, tpr in roc if tpr not in [0.0, 1.0])
Code Example #8
File: test_metrics.py Project: zeta1999/causalnex
    def test_auc_for_nonnumeric_features(self):
        """AUC of accurate predictions should be 1 even after remapping numbers to strings"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2) for _ in range(1)]
            + [[a, b, 1] for a in range(0, 2) for b in range(0, 2)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a in range(2, 4) for b in range(2, 4) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        # remap values in column c
        train["c"] = train["c"].map({0: "f", 1: "g"})

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Code Example #9
    def test_number_of_nodes(self, num_nodes):
        """ Length of each row in generated data equals num_nodes """
        graph = StructureModel()
        edges = [(n, n + 1, 1) for n in range(num_nodes - 1)]
        graph.add_weighted_edges_from(edges)

        data = generate_binary_data(graph, 100, seed=10)
        assert all(len(sample) == num_nodes for sample in data)
Code Example #10
    def test_number_of_columns(self, num_nodes, n_categories):
        """ Length of dataframe is in the correct shape"""
        graph = StructureModel()
        edges = [(n, n + 1, 1) for n in range(num_nodes - 1)]
        graph.add_weighted_edges_from(edges)

        data = generate_categorical_dataframe(graph,
                                              100,
                                              seed=10,
                                              n_categories=n_categories)
        assert data.shape[1] == (num_nodes * n_categories)
Code Example #11
    def generator(num_nodes, seed, weight=None):
        np.random.seed(seed)

        sm = StructureModel()
        nodes = list("".join(x) for x in product(
            string.ascii_lowercase, string.ascii_lowercase))[:num_nodes]
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)

        # one edge:
        sm.add_weighted_edges_from([("aa", "ab", weight)])
        return sm
Code Example #12
    def test_incorrect_weight_dist(self):
        sm = StructureModel()
        nodes = list(str(x) for x in range(6))
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)

        sm.add_weighted_edges_from([("0", "1", None), ("2", "4", None)])

        with pytest.raises(ValueError, match="Unknown weight distribution"):
            _ = sem_generator(
                graph=sm,
                schema=None,
                default_type="continuous",
                distributions={"weight": "unknown"},
                noise_std=2.0,
                n_samples=1000,
                intercept=False,
                seed=10,
            )
Code Example #13
def from_pandas_lasso(
    X: pd.DataFrame,
    beta: float,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies
    between variables in data presented as a pandas dataframe, with lasso regularisation.

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        beta: Constant that multiplies the lasso term.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges (from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
         StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """

    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if not non_numeric_cols.empty:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int "
            "{non_numeric_cols}".format(non_numeric_cols=non_numeric_cols)
        )

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy_lasso(
        data.values,
        beta,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(
        [(idx_col[u], idx_col[v], w) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
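
A minimal usage sketch for from_pandas_lasso. The data, column names, and beta value below are illustrative assumptions, not taken from the source:

# Hypothetical usage sketch: learn a sparse structure from numeric data.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = rng.normal(size=1000)
df = pd.DataFrame({"x": x, "y": 2.0 * x + rng.normal(size=1000)})

sm = from_pandas_lasso(df, beta=0.1, w_threshold=0.3)
print(list(sm.edges(data="weight")))  # expected to recover an x -> y edge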
Code Example #14
def from_pandas(
    X: pd.DataFrame,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies between variables
    in data presented as a pandas dataframe.

    The optimisation minimises a score function :math:`F(W)` over the graph's
    weighted adjacency matrix, :math:`W`, subject to a constraint function :math:`h(W)`,
    where :math:`h(W) == 0` characterises an acyclic graph.
    :math:`h(W) > 0` is a continuous, differentiable function that quantifies how far
    the graph is from being acyclic (smaller == more acyclic).
    Full details of this approach to structure learning are provided in the publication:

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges (from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
         StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """

    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if len(non_numeric_cols) > 0:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int "
            "{non_numeric_cols}".format(non_numeric_cols=non_numeric_cols)
        )

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy(
        data.values,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(
        [(idx_col[u], idx_col[v], w) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
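
The constraint function referenced in the docstring has a closed form in the cited paper: h(W) = trace(exp(W ∘ W)) - d, which is zero exactly when W is the weighted adjacency matrix of a DAG. A standalone sketch of that measure follows; it illustrates the paper's formula and is not a copy of causalnex's internal implementation:

# Sketch of the NO TEARS acyclicity measure from Zheng et al. (2018):
# h(W) = trace(expm(W * W)) - d, zero iff W describes a DAG.
# Illustrative only, not the library's internal code.
import numpy as np
from scipy.linalg import expm


def notears_h(W: np.ndarray) -> float:
    d = W.shape[0]
    # Hadamard square W * W makes every entry non-negative before expm.
    return float(np.trace(expm(W * W)) - d)


print(notears_h(np.array([[0.0, 1.0], [0.0, 0.0]])))  # ~0.0 (acyclic)
print(notears_h(np.array([[0.0, 1.0], [1.0, 0.0]])))  # > 0 (2-cycle)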
Code Example #15
def generate_structure_dynamic(  # pylint: disable=too-many-arguments
    num_nodes: int,
    p: int,
    degree_intra: float,
    degree_inter: float,
    graph_type_intra: str = "erdos-renyi",
    graph_type_inter: str = "erdos-renyi",
    w_min_intra: float = 0.5,
    w_max_intra: float = 0.5,
    w_min_inter: float = 0.5,
    w_max_inter: float = 0.5,
    w_decay: float = 1.0,
) -> StructureModel:
    """
    Generates a dynamic DAG at random.

    Args:
        num_nodes: Number of nodes
        p: maximum lag to be considered in the structure
        degree_intra: expected degree on nodes from the current state
        degree_inter: expected degree on nodes from the lagged nodes
        graph_type_intra:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes
            - full: constructs a fully-connected graph - degree has no effect
        graph_type_inter:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - full: connect all past nodes to all present nodes
        w_min_intra: minimum weight for intra-slice nodes
        w_max_intra: maximum weight for intra-slice nodes
        w_min_inter: minimum weight for inter-slice nodes
        w_max_inter: maximum weight for inter-slice nodes
        w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay

    Raises:
        ValueError: if graph type unknown or `num_nodes < 2`

    Returns:
        StructureModel containing all simulated nodes and edges (intra- and inter-slice)
    """
    sm_intra = generate_structure(
        num_nodes=num_nodes,
        degree=degree_intra,
        graph_type=graph_type_intra,
        w_min=w_min_intra,
        w_max=w_max_intra,
    )
    sm_inter = _generate_inter_structure(
        num_nodes=num_nodes,
        p=p,
        degree=degree_inter,
        graph_type=graph_type_inter,
        w_min=w_min_inter,
        w_max=w_max_inter,
        w_decay=w_decay,
    )
    res = StructureModel()
    res.add_nodes_from(sm_inter.nodes)
    res.add_nodes_from([f"{u}_lag0" for u in sm_intra.nodes])
    res.add_weighted_edges_from(sm_inter.edges.data("weight"))
    res.add_weighted_edges_from([(f"{u}_lag0", f"{v}_lag0", w)
                                 for u, v, w in sm_intra.edges.data("weight")])
    return res
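
A minimal usage sketch for generate_structure_dynamic; the parameter values are illustrative assumptions:

# Illustrative call: a random dynamic DAG over 4 nodes with up to 2 lags.
g = generate_structure_dynamic(num_nodes=4, p=2, degree_intra=2.0, degree_inter=1.0)
# Judging from the function body, node names carry a time-slice suffix
# such as "0_lag0" (current slice) or "0_lag1" (one step back).
print(sorted(g.nodes))
print(len(g.edges))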
Code Example #16
def graph():
    graph = StructureModel()
    edges = [(n, n + 1, 1) for n in range(5)]
    graph.add_weighted_edges_from(edges)
    return graph
Code Example #17
    def test_mixed_type_independence(self, seed, n_categories,
                                     weight_distribution,
                                     intercept_distribution):
        """
        Test whether the relation is accurate, implicitly tests sequence of
        nodes.
        """
        np.random.seed(seed)

        sm = StructureModel()
        nodes = list(str(x) for x in range(6))
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)
        # binary -> categorical
        sm.add_weighted_edges_from([("0", "1", 10)])
        # binary -> continuous
        sm.add_weighted_edges_from([("2", "4", None)])
        # binary -> count
        sm.add_weighted_edges_from([("2", "6", 100)])

        schema = {
            "0": "binary",
            "1": "categorical:{}".format(n_categories),
            "2": "binary",
            "4": "continuous",
            "5": "categorical:{}".format(n_categories),
            "6": "count",
        }

        df = sem_generator(
            graph=sm,
            schema=schema,
            default_type="continuous",
            distributions={
                "weight": weight_distribution,
                "intercept": intercept_distribution,
                "count": 0.05,
            },
            noise_std=2,
            n_samples=100000,
            intercept=True,
            seed=seed,
        )

        atol = 0.05  # 5% difference between joint & factored!
        # 1. dependent links
        # 0 -> 1 (we look at the class with the highest deviation from uniform
        # to avoid small values)
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "0",
                                                      "1_{}".format(c))
        assert not np.isclose(joint_proba, factored_proba, rtol=0, atol=atol)
        # 2 -> 4
        assert not np.isclose(
            df["4"].mean(), df["4"][df["2"] == 1].mean(), rtol=0, atol=atol)
        # binary on count
        assert not np.isclose(
            df.loc[df["2"] == 0, "6"].mean(),
            df.loc[df["2"] == 1, "6"].mean(),
            rtol=0,
            atol=atol,
        )

        tol = 0.15  # relative tolerance of +/- 15% of the (absolute) value
        # 2. independent links
        # categorical
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "0",
                                                      "5_{}".format(c))
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # binary
        joint_proba, factored_proba = calculate_proba(df, "0", "2")
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # categorical
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        d, _ = max(
            [(d, np.abs(df["5_{}".format(d)].mean() - 1 / n_categories))
             for d in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "1_{}".format(d),
                                                      "5_{}".format(c))
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # continuous
        # for gaussian distributions, zero covariance is equivalent to independence
        assert np.isclose(df[["3", "4"]].corr().values[0, 1], 0, atol=tol)
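
The calculate_proba helper used throughout this test is not shown in the excerpt. A plausible minimal reconstruction, assuming it compares the joint probability of two binary indicator columns both being 1 against the factored product of their marginals (hypothetical; the project's actual helper may differ):

from typing import Tuple

import pandas as pd


def calculate_proba(df: pd.DataFrame, col_a: str, col_b: str) -> Tuple[float, float]:
    # Hypothetical reconstruction: under independence, the joint probability
    # P(a=1, b=1) equals the factored product P(a=1) * P(b=1).
    p_a = (df[col_a] == 1).mean()
    p_b = (df[col_b] == 1).mean()
    joint_proba = ((df[col_a] == 1) & (df[col_b] == 1)).mean()
    return joint_proba, p_a * p_b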