Code example #1
    def test_empty_data_raises_error(self):
        """
        Providing an empty data set should result in a ValueError explaining that data must not be empty.
        Raising this error explicitly lets callers catch and handle it gracefully; otherwise the user
        would hit a misleading division-by-zero or unpacking error instead.
        """

        with pytest.raises(ValueError):
            from_pandas(pd.DataFrame(data=[], columns=["a"]))
Code example #2
    def test_check_array(self, data):
        """
        Providing a data set containing NaN or inf should result in a ValueError explaining that data contains NaN.
        Raising this error explicitly lets callers catch and handle it gracefully; otherwise the user
        would silently end up with an empty structure.
        """
        with pytest.raises(
            ValueError,
            match="Input contains NaN, infinity or a value too large for dtype",
        ):
            from_pandas(pd.DataFrame(data=data, columns=["a"]))
Code example #3
    def test_f1score_generated(self, adjacency_mat_num_stability):
        """Structure learnt from regularisation should have very high f1 score relative to the ground truth"""
        df = pd.DataFrame(
            adjacency_mat_num_stability,
            columns=["a", "b", "c", "d", "e"],
            index=["a", "b", "c", "d", "e"],
        )
        train_model = StructureModel(df)
        X = generate_continuous_dataframe(train_model,
                                          50,
                                          noise_scale=1,
                                          seed=1)
        g = from_pandas(X, lasso_beta=0.1, w_threshold=0.25)
        right_edges = train_model.edges

        n_predictions_made = len(g.edges)
        n_correct_predictions = len(
            set(g.edges).intersection(set(right_edges)))
        n_relevant_predictions = len(right_edges)

        precision = n_correct_predictions / n_predictions_made
        recall = n_correct_predictions / n_relevant_predictions
        f1_score = 2 * (precision * recall) / (precision + recall)

        assert f1_score > 0.85
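
The precision/recall/F1 computation above is repeated verbatim in several of these tests (see also code examples #5 and #9). A small helper like the one below (hypothetical, not part of the test suite) could factor it out; it assumes both edge collections are iterables of (u, v) tuples, that at least one edge was predicted, and that at least one prediction is correct:

def f1_score_of_edges(learned_edges, true_edges):
    """F1 score of a learned edge set against a ground-truth edge set."""
    learned, true = set(learned_edges), set(true_edges)
    n_correct = len(learned & true)
    precision = n_correct / len(learned)  # fraction of predicted edges that are correct
    recall = n_correct / len(true)        # fraction of true edges that were recovered
    return 2 * (precision * recall) / (precision + recall)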
Code example #4
    def test_no_cycles(self, train_data_idx):
        """
        The learned structure should be acyclic
        """

        g = from_pandas(train_data_idx, w_threshold=0.25)
        assert nx.algorithms.is_directed_acyclic_graph(g)
Code example #5
    def test_f1score_generated_binary(self):
        """ Binary strucutre learned should have good f1 score """
        np.random.seed(10)
        sm = generate_structure(5, 2.0)
        df = generate_binary_dataframe(sm,
                                       1000,
                                       intercept=False,
                                       noise_scale=0.1,
                                       seed=10)

        dist_type_schema = {i: "bin" for i in range(df.shape[1])}
        sm_fitted = from_pandas(
            df,
            dist_type_schema=dist_type_schema,
            lasso_beta=0.1,
            ridge_beta=0.0,
            w_threshold=0.1,
            use_bias=False,
        )

        right_edges = sm.edges
        n_predictions_made = len(sm_fitted.edges)
        n_correct_predictions = len(
            set(sm_fitted.edges).intersection(set(right_edges)))
        n_relevant_predictions = len(right_edges)

        precision = n_correct_predictions / n_predictions_made
        recall = n_correct_predictions / n_relevant_predictions
        f1_score = 2 * (precision * recall) / (precision + recall)

        assert f1_score > 0.8
Code example #6
    def fit(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]
    ) -> "DAGRegressor":
        """
        Fit the structure model using the concatenation of X and y.
        """

        # defensive X, y checks
        check_X_y(X, y, y_numeric=True)

        # force as DataFrame and Series (for later calculations)
        X = pd.DataFrame(X)
        y = pd.Series(y)
        # force a name so that it is not None (a None name causes errors in notears)
        y.name = y.name or "__target"

        if self.standardize:
            self.ss_X = StandardScaler()
            self.ss_y = StandardScaler()
            X = pd.DataFrame(self.ss_X.fit_transform(X), columns=X.columns)
            y = pd.Series(
                self.ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1),
                name=y.name,
            )

        # preserve the feature and target colnames
        self._features = tuple(X.columns)
        self._target = y.name

        # concat X and y along column axis
        X = pd.concat([X, y], axis=1)

        # make copy to prevent mutability
        tabu_parent_nodes = copy.deepcopy(self.tabu_parent_nodes)
        if self.dependent_target:
            if tabu_parent_nodes is None:
                tabu_parent_nodes = [self._target]
            elif self._target not in tabu_parent_nodes:
                tabu_parent_nodes.append(self._target)

        # fit the structured model
        self.graph_ = notears.from_pandas(
            X,
            lasso_beta=self.alpha,
            ridge_beta=self.beta,
            hidden_layer_units=self.hidden_layer_units,
            w_threshold=self.threshold,
            tabu_edges=self.tabu_edges,
            tabu_parent_nodes=tabu_parent_nodes,
            tabu_child_nodes=self.tabu_child_nodes,
            use_bias=self.fit_intercept,
            **self.kwargs
        )

        # keep thresholding until the DAG constraint is enforced
        if self.enforce_dag:
            self.graph_.threshold_till_dag()

        return self
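
A minimal usage sketch for this fit method, assuming the surrounding class is causalnex's sklearn-compatible DAGRegressor and that the constructor arguments shown here exist with these names (illustrative, not verified against a specific version):

import numpy as np
import pandas as pd
from causalnex.structure import DAGRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 3)), columns=["a", "b", "c"])
y = pd.Series(2 * X["a"] + rng.normal(size=200), name="target")

reg = DAGRegressor(alpha=0.1, beta=0.0, fit_intercept=True, enforce_dag=True)
reg.fit(X, y)            # learns reg.graph_ via notears.from_pandas
print(reg.graph_.edges)  # learned structure, including the target node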
Code example #7
    def test_certain_relationships_get_near_certain_weight(self):
        """If observations reliably show a==b and !a==!b then the relationship from a->b should be certain"""

        data = pd.DataFrame([[0, 1] for _ in range(10)], columns=["a", "b"])
        g = from_pandas(data)
        # from_pandas names nodes after the columns, so compare node labels, not indices
        assert all(
            0.99 <= weight <= 1
            for u, v, weight in g.edges(data="weight")
            if u == "a" and v == "b"
        )
Code example #8
    def test_inverse_relationships_get_negative_weight(self):
        """If observations indicate a==!b and b==!a then the weight of the relationship from a-> should be negative"""

        data = pd.DataFrame([[0, 1] for _ in range(10)], columns=["a", "b"])
        # DataFrame.append returned a new frame (and is removed in pandas 2.x),
        # so build the combined frame with pd.concat and assign the result
        data = pd.concat(
            [data, pd.DataFrame([[1, 0] for _ in range(10)], columns=["a", "b"])],
            ignore_index=True,
        )
        g = from_pandas(data)
        assert all(
            weight < 0
            for u, v, weight in g.edges(data="mean_effect")
            if u == "a" and v == "b"
        )
Code example #9
    def test_f1_score_fixed(self, train_data_idx, train_model):
        """Structure learnt from regularisation should have very high f1 score relative to the ground truth"""
        g = from_pandas(train_data_idx, lasso_beta=0.01, w_threshold=0.25)

        n_predictions_made = len(g.edges)
        n_correct_predictions = len(set(g.edges).intersection(set(train_model.edges)))
        n_relevant_predictions = len(train_model.edges)

        precision = n_correct_predictions / n_predictions_made
        recall = n_correct_predictions / n_relevant_predictions
        f1_score = 2 * (precision * recall) / (precision + recall)

        assert f1_score > 0.8
Code example #10
    def test_multiple_tabu(self, train_data_idx):
        """Any edge related to tabu edges/parent nodes/child nodes should not exist in the network"""

        tabu_e = [("d", "a"), ("b", "c")]
        tabu_p = ["b"]
        tabu_c = ["a", "d"]
        g = from_pandas(
            train_data_idx,
            tabu_edges=tabu_e,
            tabu_parent_nodes=tabu_p,
            tabu_child_nodes=tabu_c,
        )
        assert all(e not in g.edges for e in tabu_e)
        assert all(p not in [e[0] for e in g.edges] for p in tabu_p)
        assert all(c not in [e[1] for e in g.edges] for c in tabu_c)
Code example #11
    def test_sparsity(self, train_data_idx):
        """Structure learnt from larger lambda should be sparser than smaller lambda"""

        g1 = from_pandas(train_data_idx, lasso_beta=10.0, w_threshold=0.25)
        g2 = from_pandas(train_data_idx, lasso_beta=1e-6, w_threshold=0.25)
        assert len(g1.edges) < len(g2.edges)
Code example #12
    def fit(
        self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray]
    ):
        """
        Fit the structure model using the concatenation of X and y.
        """

        # defensive X, y checks
        check_X_y(X, y, y_numeric=True)

        # force X, y to DataFrame, Series for later calculations
        X = pd.DataFrame(X)
        y = pd.Series(y)
        # force a name so that it is not None (a None name causes errors in notears)
        y.name = y.name or "__target"

        # if self.dist_type_schema is None, assume all columns are continuous
        # NOTE: this is copied due to later insertions
        dist_type_schema = copy.deepcopy(self.dist_type_schema) or {
            col: "cont" for col in X.columns
        }

        if self.standardize:
            # only standardize the continuous dist type columns.
            self.continuous_col_idxs = [
                X.columns.get_loc(col)
                for col, alias in dist_type_schema.items() if alias == "cont"
            ]

            # copy X to prevent changes to the underlying array data
            X = X.copy()
            self._ss_X = StandardScaler()
            X.iloc[:, self.continuous_col_idxs] = self._ss_X.fit_transform(
                X.iloc[:, self.continuous_col_idxs])

            # if the target is continuous, standardize it as well
            if self._target_dist_type == "cont":
                y = y.copy()
                self._ss_y = StandardScaler()
                y[:] = self._ss_y.fit_transform(y.values.reshape(
                    -1, 1)).reshape(-1)

        # add the target to the dist_type_schema
        # NOTE: this must be done AFTER standardize
        dist_type_schema[y.name] = self._target_dist_type

        # preserve the feature and target colnames
        self._features = tuple(X.columns)
        self._target = y.name

        # concat X and y along column axis
        X = pd.concat([X, y], axis=1)

        # make copy to prevent mutability
        tabu_parent_nodes = copy.deepcopy(self.tabu_parent_nodes)
        if self.dependent_target:
            if tabu_parent_nodes is None:
                tabu_parent_nodes = [self._target]
            elif self._target not in tabu_parent_nodes:
                tabu_parent_nodes.append(self._target)

        # fit the structured model
        self.graph_ = notears.from_pandas(
            X,
            dist_type_schema=dist_type_schema,
            lasso_beta=self.alpha,
            ridge_beta=self.beta,
            hidden_layer_units=self.hidden_layer_units,
            w_threshold=self.threshold,
            tabu_edges=self.tabu_edges,
            tabu_parent_nodes=tabu_parent_nodes,
            tabu_child_nodes=self.tabu_child_nodes,
            use_bias=self.fit_intercept,
            **self.kwargs,
        )

        # keep thresholding until the DAG constraint is enforced
        if self.enforce_dag:
            self.graph_.threshold_till_dag()

        return self
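
Assuming this fit method belongs to the same sklearn-style wrapper and that its constructor accepts the dist_type_schema and standardize attributes it reads (an unverified assumption; names are illustrative), a schema-aware call might look like:

import numpy as np
import pandas as pd
from causalnex.structure import DAGRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "a": rng.normal(size=200),            # continuous feature
    "b": rng.binomial(1, 0.5, size=200),  # binary feature
})
y = pd.Series(X["a"] + rng.normal(size=200), name="target")

reg = DAGRegressor(dist_type_schema={"a": "cont", "b": "bin"}, standardize=True)
reg.fit(X, y)  # only column "a" and the continuous target are standardized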
Code example #13
    def test_pandas_notears_with_schema(self, X, schema):
        """from_pandas should accept a dist_type_schema without raising"""
        X = pd.DataFrame(X)
        from_pandas(X, dist_type_schema=schema)
Code example #14
    def test_non_numeric_data_raises_error(self):
        """Only numeric data frames should be supported"""

        with pytest.raises(ValueError,
                           match="All columns must have numeric data.*"):
            from_pandas(pd.DataFrame(data=["x"], columns=["a"]))
Code example #15
    def test_tabu_expected_edges(self, train_data_idx):
        """Tabu edges should not exist in the network"""

        tabu_e = [("d", "a"), ("b", "c")]
        g = from_pandas(train_data_idx, tabu_edges=tabu_e)
        assert all(e not in g.edges for e in tabu_e)
Code example #16
    def test_expected_structure_learned(self, train_data_idx, train_model):
        """Given a small data set that can be examined by hand, the structure should be deterministic"""

        g = from_pandas(train_data_idx, w_threshold=0.25)
        assert set(g.edges) == set(train_model.edges)
Code example #17
    def test_isolated_nodes_exist(self, train_data_idx):
        """Isolated nodes should still be in the learned structure"""

        g = from_pandas(train_data_idx, w_threshold=1.0)
        assert len(g.nodes) == len(train_data_idx.columns)
Code example #18
    def test_all_columns_in_structure(self, train_data_idx):
        """Every columns that is in the data should become a node in the learned structure"""

        g = from_pandas(train_data_idx)
        assert len(g.nodes) == len(train_data_idx.columns)
Code example #19
    def test_tabu_expected_parent_nodes(self, train_data_idx):
        """Tabu parent nodes should not have any outgoing edges"""

        tabu_p = ["a", "d", "b"]
        g = from_pandas(train_data_idx, tabu_parent_nodes=tabu_p)
        assert all(p not in [e[0] for e in g.edges] for p in tabu_p)
Code example #20
    def test_sparsity_against_without_reg(self, train_data_idx):
        """Structure learnt from regularisation should be sparser than the one without"""

        g1 = from_pandas(train_data_idx, lasso_beta=10.0, w_threshold=0.25)
        g2 = from_pandas(train_data_idx, w_threshold=0.25)
        assert len(g1.edges) < len(g2.edges)
Code example #21
    def test_tabu_expected_child_nodes(self, train_data_idx):
        """Tabu child nodes should not have any ingoing edges"""

        tabu_c = ["a", "d", "b"]
        g = from_pandas(train_data_idx, tabu_child_nodes=tabu_c)
        assert all(c not in [e[1] for e in g.edges] for c in tabu_c)