def runSearch(data, algo_var_types):
    """Run the IC search on ``data`` and return the edges it found.

    :param data: dataset passed straight to ``IC.search``.
    :param algo_var_types: variable-type mapping passed straight to
        ``IC.search``.
    :return: ``edges(data=True)`` of the graph returned by the search.
    """
    searcher = IC(RobustRegressionTest)
    discovered = searcher.search(data, algo_var_types)
    return discovered.edges(data=True)
class Test_IC(TestAPI):
    """Tests for the individual steps of the IC search on synthetic data.

    ``setUp`` draws data from the linear model
    ``x1 -> x2``, ``x1 -> x3``, ``x2 -> x4 <- x3``, ``x4 -> x5``
    and records the neighbours, colliders and marked edge the tests expect.

    Edge attributes are read with ``graph[i][j]`` and node attributes through
    ``graph.nodes(data=True)`` so the assertions do not depend on the
    ``Graph.edge`` / ``Graph.node`` attributes, which newer networkx releases
    no longer provide.
    """

    def setUp(self):
        # Draw order matters for reproducibility under a fixed seed.
        x1 = numpy.random.normal(size=TEST_SET_SIZE)
        x2 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
        x3 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
        x4 = x2 + x3 + numpy.random.normal(size=TEST_SET_SIZE)
        x5 = x4 + numpy.random.normal(size=TEST_SET_SIZE)
        self.X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5})
        # 'c' is the type code used for every variable in this fixture.
        self.variable_types = {'x1': 'c', 'x2': 'c', 'x3': 'c', 'x4': 'c', 'x5': 'c'}
        self.true_neighbors = {
            'x1': {'x2', 'x3'},
            'x2': {'x1', 'x4'},
            'x3': {'x1', 'x4'},
            'x4': {'x2', 'x3', 'x5'},
            'x5': {'x4'},
        }
        self.true_colliders = {('x3', 'x4'), ('x2', 'x4')}
        self.true_marked = {('x4', 'x5')}
        self.ic = IC(RobustRegressionTest, self.X, self.variable_types)

    def test_build_g(self):
        """The initial graph is complete, typed, and has no marked edges."""
        self.ic._build_g()
        graph = self.ic._g
        V = len(self.X.columns)
        # V * (V - 1) is always even, so integer division is exact.
        assert len(graph.edges()) == (V - 1) * V // 2
        assert set(graph.nodes()) == set(self.variable_types)
        node_attrs = dict(graph.nodes(data=True))
        for node, variable_type in self.variable_types.items():
            assert node_attrs[node]['type'] == variable_type
        for i, j in graph.edges():
            assert not graph[i][j]['marked']

    def test_find_skeleton(self):
        """After skeleton search every node has exactly its true neighbours."""
        self.ic._build_g()
        self.ic._find_skeleton()
        for node, neighbors in self.true_neighbors.items():
            assert set(self.ic._g.neighbors(node)) == neighbors

    def test_orient_colliders(self):
        """Arrowheads appear on the expected collider edges and nowhere else."""
        self.ic._build_g()
        self.ic._find_skeleton()
        self.ic._orient_colliders()
        graph = self.ic._g
        for i, j in graph.edges():
            measured_colliders = graph[i][j]['arrows']
            if len(measured_colliders) > 0:
                # The edge is stored in arbitrary order; pick the orientation
                # that ends at the recorded arrowhead.
                expected = (i, j) if j in measured_colliders else (j, i)
                assert expected in self.true_colliders
            else:
                assert (i, j) not in self.true_colliders
                assert (j, i) not in self.true_colliders

    def test_separating_set(self):
        """Each removed edge has a separating set that passes the test."""
        self.ic._build_g()
        self.ic._find_skeleton()
        for xi, xj in itertools.combinations(self.variable_types.keys(), 2):
            if self.ic._g.has_edge(xi, xj):
                continue
            # The separating set may be keyed by either ordering of the pair.
            if (xi, xj) in self.ic.separating_sets:
                z = self.ic.separating_sets[(xi, xj)]
            else:
                z = self.ic.separating_sets[(xj, xi)]
            test = self.ic.independence_test([xj], [xi], list(z), self.X, self.ic.alpha)
            assert test.independent()

    def test_marked_directed_path(self):
        """A chain of marked edges a->b->c->d is a marked directed path."""
        marked_edges = [('a', 'b'), ('b', 'c'), ('c', 'd')]
        unmarked_edges = [('a', 'd')]
        g = nx.Graph()
        g.add_edges_from(marked_edges, marked=True)
        g.add_edges_from(unmarked_edges, marked=False)
        for i, j in marked_edges + unmarked_edges:
            # Every edge points at its second endpoint.
            g[i][j]['arrows'] = [j]
        self.ic._g = g
        assert self.ic._marked_directed_path('a', 'd')
        assert not self.ic._marked_directed_path('d', 'a')

    def test_recursion_rule_1(self):
        pass

    def test_recursion_rule_2(self):
        pass

    def test_search(self):
        """The full search marks exactly the edges listed in ``true_marked``."""
        self.ic.search()
        graph = self.ic._g
        for i, j in graph.edges():
            in_truth = (i, j) in self.true_marked or (j, i) in self.true_marked
            if graph[i][j]['marked']:
                assert in_truth
            else:
                assert not in_truth
# First example: x1 -> x2, x1 -> x3, x2 -> x4 <- x3, x4 -> x5.
# The order of the random draws is kept so a fixed seed gives the same data.
x1 = np.random.normal(size=SIZE)
x2 = x1 + np.random.normal(size=SIZE)
x3 = x1 + np.random.normal(size=SIZE)
x4 = x2 + x3 + np.random.normal(size=SIZE)
x5 = x4 + np.random.normal(size=SIZE)

# Put the generated columns into a data frame.
X = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, x4=x4, x5=x5))

# Variable types: 'c' means 'continuous'. Only the variables listed here are
# searched over -- NOT every column present in the data frame.
variable_types = dict.fromkeys(('x1', 'x2', 'x3', 'x4', 'x5'), 'c')

ic_algorithm = IC(RobustRegressionTest)
graph = ic_algorithm.search(X, variable_types)
e = graph.edges(data=True)
print(f"{e}")

# Second example: x6 feeds both x4 and x5 but is left out of the data frame.
SIZE = 2000
x1 = np.random.normal(size=SIZE)
x2 = x1 + np.random.normal(size=SIZE)
x3 = x1 + np.random.normal(size=SIZE)
x6 = np.random.normal(size=SIZE)
x4 = x2 + x3 + x6 + np.random.normal(size=SIZE)
x5 = x6 + np.random.normal(size=SIZE)

# Put the generated columns into a data frame.
X = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, x4=x4, x5=x5))
def fit(self, X):
    '''
    Fit a Gaussian copula whose marginals are estimated with Gaussian KDE.

    Each column of the (optionally scaled) data is mapped through its
    KDE-based cumulative marginal, truncated to ``[theta, 1 - theta]`` with
    ``theta = 1 / N``, and then through the inverse standard-normal CDF.
    What happens next depends on ``self.method``:

    * ``'mle'``, ``'glasso'``, ``'ledoit_wolf'``, ``'spectral'`` estimate a
      covariance from the transformed data and set ``self.est_cov``,
      ``self.corr`` and ``self.precision_``;
    * ``'pc'`` and ``'ic'`` run a structure search on the scaled data and
      set ``self.conditional_independences_``.

    Finally ``self.fit_structure(self.precision_)`` is called.

    :param X: input dataset, a 2-D array of shape (N, d)
    :return: None; all results are stored on ``self``
    '''
    N, d = X.shape
    X_scale = self.scaler.fit_transform(X) if self.scaler is not None else X
    if len(self.vertexes) == 0:
        # No names supplied: label the columns '0', '1', ...
        self.vertexes = [str(col) for col in range(d)]
    self.theta = 1.0 / N

    # Always a float buffer: np.zeros_like(X) would inherit an integer dtype
    # from integer input and silently truncate the marginals to 0.
    cum_marginals = np.zeros((N, d), dtype=float)
    self.kernels = []

    # TODO: complexity O(Nd) is high
    if self.verbose:
        print(colored('>> Computing marginals', color='blue'))
    for j in range(d):
        column = X_scale[:, j]
        col_min = column.min()  # loop-invariant lower integration bound
        kernel = gaussian_kde(column)
        self.kernels.append(kernel)
        # KDE mass over the observed range, used to normalise each marginal.
        cum_pdf_overall = kernel.integrate_box_1d(col_min, column.max())
        for i in range(N):
            cum_marginals[i, j] = (
                kernel.integrate_box_1d(col_min, column[i]) / cum_pdf_overall)

    # Truncate the cumulative marginals so the inverse CDF stays finite.
    cum_marginals = np.clip(cum_marginals, self.theta, 1 - self.theta)
    # Inverse of normal CDF: \Phi^{-1}(F_j(x))
    inv_norm_cdf = norm.ppf(cum_marginals)

    method = self.method
    if method == 'mle':
        # maximum-likelihood estimator
        empirical_cov = EmpiricalCovariance()
        empirical_cov.fit(inv_norm_cdf)
        if self.verbose:
            print(colored('>> Running MLE to estiamte precision matrix',
                          color='blue'))
        self.est_cov = empirical_cov.covariance_
        self.corr = scale_matrix(self.est_cov)
        self.precision_ = inv(empirical_cov.covariance_)
    elif method == 'glasso':
        if self.verbose:
            print(colored('>> Running glasso to estiamte precision matrix',
                          color='blue'))
        empirical_cov = EmpiricalCovariance()
        empirical_cov.fit(inv_norm_cdf)
        # Shrunk covariance to avoid numerical instability.
        shrunk_cov = shrunk_covariance(empirical_cov.covariance_,
                                       shrinkage=0.8)
        # NOTE(review): newer scikit-learn releases appear to expose this as
        # ``graphical_lasso`` -- confirm against the pinned version.
        self.est_cov, self.precision_ = graph_lasso(
            emp_cov=shrunk_cov, alpha=self.penalty,
            verbose=self.verbose, max_iter=self.max_iter)
        self.corr = scale_matrix(self.est_cov)
    elif method == 'ledoit_wolf':
        if self.verbose:
            print(colored(
                '>> Running ledoit_wolf to estiamte precision matrix',
                color='blue'))
        self.est_cov, _ = ledoit_wolf(inv_norm_cdf)
        self.corr = scale_matrix(self.est_cov)
        self.precision_ = linalg.inv(self.est_cov)
    elif method == 'spectral':
        # L2 method, from the paper "Inverse covariance estimation for high
        # dimension data in linear time and space"; formula: eq. (8).
        if self.verbose:
            print(colored(
                '>> Running Riccati to estiamte precision matrix',
                color='blue'))
        # TODO: note estimated cov is sample cov
        self.est_cov, self.precision_ = spectral(
            inv_norm_cdf, rho=2 * self.penalty, assume_centered=False)
        self.corr = scale_matrix(self.est_cov)
    elif method == 'pc':
        clf = pgmlearner.PGMLearner()
        # One {vertex name: value} record per row of the scaled data.
        data_list = []
        for row_id in range(X_scale.shape[0]):
            data_list.append(
                {n: X_scale[row_id, i] for i, n in enumerate(self.vertexes)})
        graph = clf.lg_constraint_estimatestruct(
            data=data_list, pvalparam=self.pval, bins=self.bins)
        dag = np.zeros(shape=(len(graph.V), len(graph.V)))
        for e in graph.E:
            dag[self.vertexes.index(e[0]), self.vertexes.index(e[1])] = 1
        self.conditional_independences_ = dag
    elif method == 'ic':
        n_cols = X_scale.shape[1]
        df = {self.vertexes[j]: X_scale[:, j] for j in range(n_cols)}
        variable_types = {name: 'c' for name in df}
        data = pd.DataFrame(df)
        # run the search
        ic_algorithm = IC(RobustRegressionTest, data, variable_types,
                          alpha=self.pval)
        graph = ic_algorithm.search()
        dag = np.zeros(shape=(n_cols, n_cols))
        for e in graph.edges(data=True):
            i = self.vertexes.index(e[0])
            j = self.vertexes.index(e[1])
            # Start with both directions set, then clear one of them when
            # the edge carries exactly one arrowhead.
            dag[i, j] = 1
            dag[j, i] = 1
            arrows = set(e[2]['arrows'])
            if len(arrows) == 1:
                head = next(iter(arrows))
                if head == e[0]:
                    dag[i, j] = 0
                elif head == e[1]:
                    dag[j, i] = 0
        self.conditional_independences_ = dag

    # finally we fit the structure
    # NOTE(review): the 'pc' and 'ic' branches do not assign
    # self.precision_ here; presumably it is initialised elsewhere -- confirm.
    self.fit_structure(self.precision_)
class Test_IC(TestAPI):
    """Tests for the individual steps of the IC search on synthetic data.

    ``setUp`` draws data from the linear model
    ``x1 -> x2``, ``x1 -> x3``, ``x2 -> x4 <- x3``, ``x4 -> x5``,
    records the neighbours, colliders and marked edge the tests expect, and
    runs one full search on a fresh ``IC`` instance.
    """

    def setUp(self):
        # Draw order matters for reproducibility under a fixed seed.
        x1 = numpy.random.normal(size=TEST_SET_SIZE)
        x2 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
        x3 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
        x4 = x2 + x3 + numpy.random.normal(size=TEST_SET_SIZE)
        x5 = x4 + numpy.random.normal(size=TEST_SET_SIZE)
        self.X = pd.DataFrame({
            "x1": x1,
            "x2": x2,
            "x3": x3,
            "x4": x4,
            "x5": x5,
        })
        # "c" is the type code used for every variable in this fixture.
        self.variable_types = {
            "x1": "c",
            "x2": "c",
            "x3": "c",
            "x4": "c",
            "x5": "c",
        }
        self.true_neighbors = {
            "x1": {"x2", "x3"},
            "x2": {"x1", "x4"},
            "x3": {"x1", "x4"},
            "x4": {"x2", "x3", "x5"},
            "x5": {"x4"},
        }
        self.true_colliders = {("x3", "x4"), ("x2", "x4")}
        self.true_marked = {("x4", "x5")}
        self.ic = IC(RobustRegressionTest)
        self.ic.search(self.X, self.variable_types)

    def test_build_g(self):
        """The initial graph is complete, typed, and has no marked edges."""
        self.ic._build_g(self.variable_types)
        graph = self.ic._g
        V = len(self.X.columns)
        # V * (V - 1) is always even, so integer division is exact.
        assert len(graph.edges()) == (V - 1) * V // 2
        assert set(graph.nodes()) == set(self.variable_types)
        for node, variable_type in self.variable_types.items():
            assert graph.nodes[node]["type"] == variable_type
        for _, _, attrs in graph.edges(data=True):
            assert not attrs["marked"]

    def test_find_skeleton(self):
        """After skeleton search every node has exactly its true neighbours."""
        self.ic._build_g(self.variable_types)
        self.ic._find_skeleton(self.X, self.variable_types)
        for node, neighbors in self.true_neighbors.items():
            assert set(self.ic._g.neighbors(node)) == neighbors

    def test_orient_colliders(self):
        """Arrowheads appear on the expected collider edges and nowhere else."""
        self.ic._build_g(self.variable_types)
        self.ic._find_skeleton(self.X, self.variable_types)
        self.ic._orient_colliders()
        for i, j, attrs in self.ic._g.edges(data=True):
            measured_colliders = attrs["arrows"]
            if len(measured_colliders) > 0:
                # The edge is stored in arbitrary order; pick the orientation
                # that ends at the recorded arrowhead.
                expected = (i, j) if j in measured_colliders else (j, i)
                assert expected in self.true_colliders
            else:
                assert (i, j) not in self.true_colliders
                assert (j, i) not in self.true_colliders

    def test_separating_set(self):
        """Each removed edge has a separating set that passes the test."""
        self.ic._build_g(self.variable_types)
        self.ic._find_skeleton(self.X, self.variable_types)
        for xi, xj in itertools.combinations(self.variable_types.keys(), 2):
            if self.ic._g.has_edge(xi, xj):
                continue
            # The separating set may be keyed by either ordering of the pair.
            if (xi, xj) in self.ic.separating_sets:
                z = self.ic.separating_sets[(xi, xj)]
            else:
                z = self.ic.separating_sets[(xj, xi)]
            test = self.ic.independence_test([xj], [xi], list(z), self.X,
                                             self.ic.alpha)
            assert test.independent()

    def test_marked_directed_path(self):
        """A chain of marked edges a->b->c->d is a marked directed path."""
        marked_edges = [("a", "b"), ("b", "c"), ("c", "d")]
        unmarked_edges = [("a", "d")]
        g = nx.Graph()
        g.add_edges_from(marked_edges, marked=True)
        g.add_edges_from(unmarked_edges, marked=False)
        for i, j in marked_edges + unmarked_edges:
            # Every edge points at its second endpoint.
            g.get_edge_data(i, j)["arrows"] = [j]
        self.ic._g = g
        assert self.ic._marked_directed_path("a", "d")
        assert not self.ic._marked_directed_path("d", "a")

    def test_recursion_rule_1(self):
        pass

    def test_recursion_rule_2(self):
        pass

    def test_search(self):
        """The full search marks exactly the edges listed in ``true_marked``."""
        self.ic.search(self.X, self.variable_types)
        for i, j, attrs in self.ic._g.edges(data=True):
            in_truth = (i, j) in self.true_marked or (j, i) in self.true_marked
            if attrs["marked"]:
                assert in_truth
            else:
                assert not in_truth
import numpy
import pandas as pd

from causality.inference.search import IC
from causality.inference.independence_tests import RobustRegressionTest

# Toy data: x1 -> x2, x1 -> x3, x2 -> x4 <- x3, x4 -> x5.
# The order of the random draws is kept so a fixed seed gives the same data.
SIZE = 2000
x1 = numpy.random.normal(size=SIZE)
x2 = x1 + numpy.random.normal(size=SIZE)
x3 = x1 + numpy.random.normal(size=SIZE)
x4 = x2 + x3 + numpy.random.normal(size=SIZE)
x5 = x4 + numpy.random.normal(size=SIZE)

# Put the generated columns into a data frame.
X = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, x4=x4, x5=x5))

# Variable types: 'c' means 'continuous'. Only the variables listed here are
# searched over -- NOT every column present in the data frame.
variable_types = dict.fromkeys(('x1', 'x2', 'x3', 'x4', 'x5'), 'c')

# Run the search; the data and variable types are given to the constructor.
ic_algorithm = IC(RobustRegressionTest, X, variable_types)
graph = ic_algorithm.search()