def posterior(epsilon, bs_dags, true_dag_dict, iv_means, iv_var, K):
    """
    Compute the posterior over DAGs given a set of interventions, by sampling
    interventional data from the true DAG and using the list of bootstrapped
    DAGs as the support of the posterior.
    """
    T = len(bs_dags)

    # === SAMPLE INTERVENTIONAL DATA FROM THE TRUE DAG
    g = cd.GaussDAG.from_amat(np.asarray(true_dag_dict['A']))
    nsamples_iv = K
    ivs = [
        {target: cd.GaussIntervention(iv_means[target], iv_var) for target in targets}
        for targets in epsilon
    ]
    y = [g.sample_interventional(iv, nsamples_iv) for iv in ivs]

    # === COMPUTE THE LOG-LIKELIHOOD OF THE DATA UNDER EACH BOOTSTRAPPED DAG
    logPy = finite.llhood(y, epsilon, bs_dags, (iv_means, iv_var))
    weighted_logPy = np.zeros(T)
    for j in range(T):
        weighted_logPy[j] = np.log(bs_dags[j]['w']) + logPy[j]

    # === NORMALIZE IN LOG-SPACE, THEN EXPONENTIATE
    P2 = np.zeros(T)  # the log distribution; converted to probabilities below
    denom = logsumexp(weighted_logPy)
    for j in range(T):
        P2[j] = weighted_logPy[j] - denom
    P2 = np.exp(P2)

    for j in range(T):
        bs_dags[j]['w'] = P2[j]
    return bs_dags
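
# A minimal usage sketch for posterior(), assuming finite.llhood, logsumexp, and
# the numpy/causaldag imports are in scope. The two-DAG support, adjacency
# matrices, weights, and intervention parameters below are all hypothetical.
#
#   true_dag_dict = {'A': [[0, 1], [0, 0]]}  # adjacency of the data-generating DAG
#   bs_dags = [
#       {'A': np.array([[0, 1], [0, 0]]), 'b': [1., 1.], 'w': 0.5},  # 0 -> 1
#       {'A': np.array([[0, 0], [1, 0]]), 'b': [1., 1.], 'w': 0.5},  # 1 -> 0
#   ]
#   epsilon = [[0], [1]]   # two interventions, each on a single target node
#   iv_means = [2.0, 2.0]  # per-node intervention means
#   bs_dags = posterior(epsilon, bs_dags, true_dag_dict, iv_means, iv_var=0.1, K=50)
#   print([d['w'] for d in bs_dags])  # updated posterior weights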
def MI_obj(epsilon, verbose=False, iter=False):
    # epsilon is a list of lists of intervention targets.
    # To speed up run-times, we assume we get K samples of each intervention.
    obj = 0
    if len(epsilon) == 0:
        return -np.inf
    for i in range(T):
        for _ in range(M):
            # Sample y_mt from the intervention and compute p(y) under each possible DAG:
            # first build a causaldag representation of the DAG from its adjacency matrix,
            # then sample from it under the intervention.
            cdag = cd.GaussDAG.from_amat(bs_dags[i]['A'], variances=bs_dags[i]['b'])
            nsamples_iv = K
            ivs = [
                {target: cd.GaussIntervention(iv_means[target], iv_var) for target in targets}
                for targets in epsilon
            ]
            y_mt = [cdag.sample_interventional(iv, nsamples_iv) for iv in ivs]
            logPy = finite.llhood(y_mt, epsilon, bs_dags, (iv_means, iv_var))

            weighted_logPy = np.zeros(T)
            for j in range(T):
                weighted_logPy[j] = np.log(bs_dags[j]['w'] / sum_ws) + logPy[j]

            # We don't need to compute P1 for the entropies since it's constant.
            # P2 is a categorical distribution over DAGs; normalize in log-space,
            # then exponentiate.
            P2 = np.zeros(T)
            denom = logsumexp(weighted_logPy)
            for j in range(T):
                P2[j] = weighted_logPy[j] - denom
            P2 = np.exp(P2)
            if verbose:
                print(P2)
            H2 = entropy(P2)  # the entropy induced by P2
            obj = obj - H2 * ws[i] / (M * sum_ws)
    return obj + entropy(ws)  # add the prior entropy so the objective is > 0
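
# A hedged sketch of how MI_obj might drive greedy design selection, assuming the
# module-level names it reads (T, M, K, bs_dags, iv_means, iv_var, ws, sum_ws)
# have been initialized elsewhere in this file. The candidate designs are
# hypothetical: each intervenes on a single target node.
#
#   candidate_designs = [[[t]] for t in range(len(iv_means))]
#   scores = [MI_obj(eps) for eps in candidate_designs]
#   best_design = candidate_designs[int(np.argmax(scores))]
#   print('highest-scoring single-target design:', best_design)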
import causaldag as cd
import time
import numpy as np

g = cd.GaussDAG([0, 1, 2], arcs={(0, 1), (0, 2)})
cov = g.covariance
nsamples = 2500
trials = 10
iv = {1: cd.GaussIntervention(0, 1)}

# === TIME INVARIANCE TEST, NO CONDITIONING SET
start = time.time()
for _ in range(trials):
    samples = g.sample(nsamples)
    cd.utils.ci_tests.hsic_test_vector(samples[:, 0], samples[:, 1])
print(time.time() - start)

# === TIME INVARIANCE TEST, WITH CONDITIONING SET
# start = time.time()
# for _ in range(trials):
#     samples = g.sample(nsamples)
#     cd.utils.ci_tests.hsic_invariance_test(samples[:, 0], samples[:, 1], 0)
# print(time.time() - start)
from causaldag.inference.structural import igsp
from causaldag.utils.ci_tests import gauss_ci_test, hsic_invariance_test
import causaldag as cd
import numpy as np
import random

np.random.seed(40)
random.seed(9879132)

nnodes = 10
nsamples = 100
dag = cd.rand.directed_erdos(nnodes, 1.5 / (nnodes - 1), 1)
gdag = cd.rand.rand_weights(dag)
obs_samples = gdag.sample(nsamples)
sample_dict = {}
sample_dict[frozenset()] = obs_samples
for i in range(10):
    sample_dict[frozenset({i})] = gdag.sample_interventional_perfect(
        {i: cd.GaussIntervention(1, .1)}, nsamples)
suffstat = dict(C=np.corrcoef(obs_samples, rowvar=False), n=nsamples)
est_dag = igsp(
    sample_dict,
    suffstat,
    nnodes,
    gauss_ci_test,
    hsic_invariance_test,
    1e-5,
    1e-5,
    nruns=5,
    verbose=True)
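
# A quick sanity check one might append here (assumes igsp returns a DAG object
# with an `arcs` attribute, like the causaldag DAGs above): compare the estimate
# against the generating DAG.
#
#   print('true arcs:     ', sorted(dag.arcs))
#   print('estimated arcs:', sorted(est_dag.arcs))
#   print('missing:', dag.arcs - est_dag.arcs, '| extra:', est_dag.arcs - dag.arcs)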
import numpy as np
import random
import causaldag as cd
import itertools as itr
import os
from config import PROJECT_FOLDER

DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'simulations', 'data')
INTERVENTIONS = {
    'perfect1': cd.GaussIntervention(1, .01),
    'perfect2': cd.GaussIntervention(1, .1),
    'inhibitory1': cd.ScalingIntervention(.1, .2),
    'soft1': cd.ScalingIntervention(.1, .2, mean=1),
    'zero': cd.GaussIntervention(0, 1),
    'shift': cd.ShiftIntervention(2)
}


def get_dag_folder(ndags, nnodes, nneighbors, dag_num, nonlinear=False):
    nonlinear_str = '_nonlinear' if nonlinear else ''
    base_folder = os.path.join(
        DATA_FOLDER,
        f'nnodes={nnodes}_nneighbors={nneighbors}_ndags={ndags}{nonlinear_str}'
    )
    return os.path.join(base_folder, f'dag{dag_num}')


def get_sample_folder(ndags, nnodes, nneighbors,
import causaldag as cd
import matplotlib.pyplot as plt
import os
import random
import numpy as np
from numpy.ma import masked_array

random.seed(181)
np.random.seed(181)

nsamples = 10
g = cd.GaussDAG(nodes=[0, 1, 2], arcs={(0, 1): 1, (0, 2): 1}, variances=[1, .2, .2])
obs_samples = g.sample(nsamples)
iv1_samples = g.sample_interventional({1: cd.GaussIntervention(0, 1)},
                                      nsamples=nsamples)
iv01_samples = g.sample_interventional(
    {1: cd.GaussIntervention(0, 1), 0: cd.GaussIntervention(1, .1)},
    nsamples=nsamples)

cmap = 'bwr'
plt.clf()
os.makedirs('figures/example_data/', exist_ok=True)
plt.imshow(obs_samples, cmap=cmap)
plt.xticks([])
plt.yticks([])
plt.tight_layout()
plt.savefig('figures/example_data/obs.png', transparent=True, bbox_inches='tight')

plt.clf()
plt.imshow(iv1_samples, cmap=cmap)
plt.xticks([])
plt.yticks([])
def simulate(strategy, simulator_config, gdag, strategy_folder,
             num_bootstrap_dags_final=100, save_gies=True):
    if os.path.exists(os.path.join(strategy_folder, 'samples')):
        return

    # === SAVE SIMULATION META-INFORMATION
    os.makedirs(strategy_folder, exist_ok=True)
    simulator_config.save(strategy_folder)

    # === SAMPLE SOME OBSERVATIONAL DATA TO START WITH
    n_nodes = len(gdag.nodes)
    all_samples = {i: np.zeros([0, n_nodes]) for i in range(n_nodes)}
    all_samples[-1] = gdag.sample(simulator_config.starting_samples)
    precision_matrix = np.linalg.inv(all_samples[-1].T @ all_samples[-1] / len(all_samples[-1]))

    # === GET GIES SAMPLES GIVEN JUST OBSERVATIONAL DATA
    if save_gies:
        initial_samples_path = os.path.join(strategy_folder, 'initial_samples.csv')
        initial_interventions_path = os.path.join(strategy_folder, 'initial_interventions')
        initial_gies_dags_path = os.path.join(strategy_folder, 'initial_dags/')
        graph_utils._write_data(all_samples, initial_samples_path, initial_interventions_path)
        graph_utils.run_gies_boot(num_bootstrap_dags_final, initial_samples_path,
                                  initial_interventions_path, initial_gies_dags_path)
        amats, dags = graph_utils._load_dags(initial_gies_dags_path, delete=True)
        for d, amat in enumerate(amats):
            np.save(os.path.join(initial_gies_dags_path, 'dag%d.npy' % d), amat)

    # === SPECIFY INTERVENTIONAL DISTRIBUTIONS BASED ON EACH NODE'S STANDARD DEVIATION
    intervention_set = list(range(n_nodes))
    if simulator_config.intervention_type == 'node-variance':
        interventions = [
            cd.BinaryIntervention(
                intervention1=cd.ConstantIntervention(val=-simulator_config.intervention_strength * std),
                intervention2=cd.ConstantIntervention(val=simulator_config.intervention_strength * std)
            ) for std in np.diag(gdag.covariance) ** .5
        ]
    elif simulator_config.intervention_type == 'constant-all':
        interventions = [
            cd.BinaryIntervention(
                intervention1=cd.ConstantIntervention(val=-simulator_config.intervention_strength),
                intervention2=cd.ConstantIntervention(val=simulator_config.intervention_strength)
            ) for _ in intervention_set
        ]
    elif simulator_config.intervention_type == 'gauss':
        interventions = [
            cd.GaussIntervention(mean=0, variance=simulator_config.intervention_strength)
            for _ in intervention_set
        ]
    elif simulator_config.intervention_type == 'constant':
        interventions = [cd.ConstantIntervention(val=0) for _ in intervention_set]
    else:
        raise ValueError

    if not simulator_config.target_allowed:
        del intervention_set[simulator_config.target]
        del interventions[simulator_config.target]
    print(intervention_set)

    # === RUN STRATEGY ON EACH BATCH
    for batch in range(simulator_config.n_batches):
        print('Batch %d with %s' % (batch, simulator_config))
        batch_folder = os.path.join(strategy_folder, 'dags_batch=%d/' % batch)
        os.makedirs(batch_folder, exist_ok=True)
        iteration_data = IterationData(
            current_data=all_samples,
            max_interventions=simulator_config.max_interventions,
            n_samples=simulator_config.n_samples,
            batch_num=batch,
            n_batches=simulator_config.n_batches,
            intervention_set=intervention_set,
            interventions=interventions,
            batch_folder=batch_folder,
            precision_matrix=precision_matrix
        )
        recommended_interventions = strategy(iteration_data)
        if not sum(recommended_interventions.values()) == iteration_data.n_samples / iteration_data.n_batches:
            raise ValueError('Did not return correct amount of samples')
        rec_interventions_nonzero = {intv_ix for intv_ix, ns in recommended_interventions.items() if ns != 0}
        if simulator_config.max_interventions is not None and len(rec_interventions_nonzero) > simulator_config.max_interventions:
            raise ValueError('Returned too many interventions')
        for intv_ix, nsamples in recommended_interventions.items():
            iv_node = intervention_set[intv_ix]
            new_samples = gdag.sample_interventional({iv_node: interventions[intv_ix]}, nsamples)
            all_samples[iv_node] = np.vstack((all_samples[iv_node], new_samples))

    samples_folder = os.path.join(strategy_folder, 'samples')
    os.makedirs(samples_folder, exist_ok=True)
    for i, samples in all_samples.items():
        np.savetxt(os.path.join(samples_folder, 'intervention=%d.csv' % i), samples)

    # === CHECK THE TOTAL NUMBER OF SAMPLES IS CORRECT
    nsamples_final = sum(all_samples[iv_node].shape[0] for iv_node in intervention_set + [-1])
    if nsamples_final != simulator_config.starting_samples + simulator_config.n_samples:
        raise ValueError('Did not use all samples')

    # === GET GIES SAMPLES GIVEN THE DATA FOR THIS SIMULATION
    if save_gies:
        final_samples_path = os.path.join(strategy_folder, 'final_samples.csv')
        final_interventions_path = os.path.join(strategy_folder, 'final_interventions')
        final_gies_dags_path = os.path.join(strategy_folder, 'final_dags/')
        graph_utils._write_data(all_samples, final_samples_path, final_interventions_path)
        graph_utils.run_gies_boot(num_bootstrap_dags_final, final_samples_path,
                                  final_interventions_path, final_gies_dags_path)
        amats, dags = graph_utils._load_dags(final_gies_dags_path, delete=True)
        for d, amat in enumerate(amats):
            np.save(os.path.join(final_gies_dags_path, 'dag%d.npy' % d), amat)
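

# A minimal strategy sketch satisfying the contract simulate() enforces above:
# the returned dict must allocate exactly n_samples / n_batches samples per batch,
# with at most max_interventions nonzero entries. The fixed index 0 is
# hypothetical, and n_samples is assumed divisible by n_batches.
def fixed_node_strategy(iteration_data):
    budget = iteration_data.n_samples // iteration_data.n_batches
    return {0: budget}  # one nonzero entry, full per-batch budget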
import numpy as np
import causaldag as cd
# create_info_gain_strategy_dag_collection, get_mec_functional_k, and
# IterationData are assumed to be in scope from this project's strategy modules.


def get_k_entropy_fxn(k):
    # Reconstructed enclosing definitions: only the body of the inner function
    # survived; `k` matches the call site get_k_entropy_fxn(len(mec)) below.
    def get_k_entropy(probs):
        # = find entropy (base-2, skipping zero-probability entries)
        mask = probs != 0
        plogps = np.zeros(len(probs))
        plogps[mask] = np.log2(probs[mask]) * probs[mask]
        return -plogps.sum()
    return get_k_entropy


np.random.seed(100)
g = cd.rand.directed_erdos(10, .5)
g = cd.GaussDAG(nodes=list(range(10)), arcs=g.arcs)
mec = [cd.DAG(arcs=arcs) for arcs in cd.DAG(arcs=g.arcs).cpdag().all_dags()]
strat = create_info_gain_strategy_dag_collection(
    mec,
    [get_mec_functional_k(mec)],
    [get_k_entropy_fxn(len(mec))],
    verbose=True)
samples = g.sample(1000)
precision_matrix = np.linalg.inv(samples.T @ samples / 1000)  # invert the empirical covariance
sel_interventions = strat(
    IterationData(
        current_data={-1: g.sample(1000)},
        max_interventions=1,
        n_samples=500,
        batch_num=0,
        n_batches=1,
        intervention_set=[0, 1, 2],
        interventions=[cd.GaussIntervention() for _ in range(3)],
        batch_folder='test_sanity',
        precision_matrix=precision_matrix))
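
# Quick check of the entropy helper: a uniform distribution over four outcomes
# has entropy log2(4) = 2 bits.
#
#   ent = get_k_entropy_fxn(4)
#   print(ent(np.array([.25, .25, .25, .25])))  # 2.0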
if __name__ == '__main__':
    import numpy as np
    import causaldag as cd
    from utils.graph_utils import cross_entropy_interventional, get_covariance_interventional, get_precision_interventional
    from scipy import stats

    amat1 = np.array([
        [0, 2, 3],
        [0, 0, 5],
        [0, 0, 0]
    ])
    g1 = cd.GaussDAG.from_amat(amat1, variances=[2, 2, 2])
    amat2 = np.array([
        [0, 3, 3],
        [0, 0, 5],
        [0, 0, 0]
    ])
    g2 = cd.GaussDAG.from_amat(amat2)
    iv_variance = .1

    actual = cross_entropy_interventional(g1, g2, 0, iv_variance)
    g1_samples = g1.sample_interventional(
        {0: cd.GaussIntervention(mean=0, variance=iv_variance)}, 1000000)
    g2_logpdfs = g2.logpdf(
        g1_samples, {0: cd.GaussIntervention(mean=0, variance=iv_variance)})
    print('approx', g2_logpdfs.mean())
    print('actual', actual)

    cov1 = get_covariance_interventional(g1, 0, iv_variance)
    cov2 = get_covariance_interventional(g2, 0, iv_variance)
    p = 3
    # Closed-form cross-entropy, written as KL(p1 || p2) plus the entropy of p1.
    cross_ent_closed_form = .5 * (-p + np.trace(np.linalg.inv(cov2).dot(cov1))
                                  + np.log(np.linalg.det(cov2)) - np.log(np.linalg.det(cov1))
                                  + np.log(np.linalg.det(2 * np.pi * np.e * cov1)))

    samples = stats.multivariate_normal(cov=cov1).rvs(1000000)
    logpdfs = stats.multivariate_normal(cov=cov2).logpdf(samples)
    cd_samples = g1.sample_interventional(
        {0: cd.GaussIntervention(mean=0, variance=iv_variance)}, 1000000)
    logpdfs_cd_samples = stats.multivariate_normal(cov=cov2).logpdf(cd_samples)
    print('scipy approx', logpdfs.mean())
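
# For reference, the identity behind the closed form above: for zero-mean
# Gaussians p1 = N(0, cov1) and p2 = N(0, cov2) in R^p, the cross-entropy
# decomposes as
#
#   H(p1, p2) = KL(p1 || p2) + H(p1)
#             = 1/2 * (-p + tr(cov2^{-1} cov1) + log det(cov2) - log det(cov1))
#               + 1/2 * log det(2*pi*e*cov1),
#
# and the Monte Carlo estimates printed above approximate E_{p1}[log p2].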
import causaldag as cd
from causaldag.utils.ci_tests import gauss_ci_test
from causaldag.inference.structural import gsp  # assumed import path, matching igsp elsewhere in this repo
from line_profiler import LineProfiler
import numpy as np
import random

np.random.seed(1729)
random.seed(1729)

nnodes = 15
g = cd.rand.rand_weights(cd.rand.directed_erdos(nnodes, 3 / (nnodes - 1), 1))
iv_node = random.randrange(nnodes)  # pick a random node to intervene on
nsamples = 100
samples = {
    frozenset(): g.sample(nsamples),
    frozenset({iv_node}): g.sample_interventional_perfect(
        {iv_node: cd.GaussIntervention(1, .1)}, nsamples)
}
corr = np.corrcoef(samples[frozenset()], rowvar=False)
suffstat = dict(C=corr, n=nsamples)

profiler = LineProfiler()


def run_gsp():
    for i in range(20):
        gsp(suffstat, nnodes, gauss_ci_test, nruns=10)


profiler.add_function(gsp)
profiler.runcall(run_gsp)
profiler.print_stats()
    for adj_mat in adj_mats:
        avg_inc_mat += graph_utils.adj2inc(adj_mat)
    true_inc_mat = graph_utils.adj2inc(true_adj_mat)
    roc_curve(true_inc_mat[:, target], avg_inc_mat[:, target])


if __name__ == '__main__':
    adj_true = np.loadtxt('./data/dataset_5000/graph_2/adjacency.csv')
    g = cd.from_amat(adj_true)
    dict_weights = {}
    for arc in g.arcs:
        dict_weights[arc] = adj_true[arc[0], arc[1]]
    gdag = cd.GaussDAG(nodes=list(range(50)), arcs=dict_weights)

    all_data = [gdag.sample(250)]
    interventions = [-1] * 250
    all_iv = np.random.randint(0, 50, 10)
    for iv in all_iv:
        interventions += [iv] * 25
        g_iv = cd.GaussIntervention(mean=2, variance=1)
        all_data.append(gdag.sample_interventional({iv: g_iv}, 25))
    all_data = np.vstack(all_data)
    interventions = np.array(interventions)

    # Shift intervention labels to 1-indexing (presumably for the R GIES
    # implementation) before saving, then shift back.
    interventions[interventions != -1] = interventions[interventions != -1] + 1
    np.savetxt('./random_data', all_data)
    np.savetxt('./random_interventions', interventions)
    interventions[interventions != -1] = interventions[interventions != -1] - 1

    graph_utils.run_gies_boot(200, './random_data', './random_interventions')
    adj_mats = graph_utils.load_adj_mats()
    np.save('./data/dataset_5000/graph_2/adj_mats_random', adj_mats)
    else:
        context = int(j[1:])
        node = i
    return gauss_invariance_test(suffstat['invariance'], context, node,
                                 cond_set=cond_set, alpha=alpha_inv)


nnodes = 5
nodes = set(range(nnodes))
nneighbors = 1.5
nsettings = 5
num_unknown_targets = 0
INTERVENTION = cd.GaussIntervention(1, .01)

d = cd.rand.directed_erdos(nnodes, nneighbors / (nnodes - 1))
g = cd.rand.rand_weights(d)
known_iv_list = random.sample(list(nodes), nsettings)
unknown_ivs_list = [
    random.sample(list(nodes - {known_iv}), num_unknown_targets)
    for known_iv in known_iv_list
]
all_ivs_list = [
    {known_iv, *unknown_ivs}
    for known_iv, unknown_ivs in zip(known_iv_list, unknown_ivs_list)
]

nsamples = 5000
obs_samples = g.sample(nsamples)
iv_samples_list = [
    g.sample_interventional({iv: INTERVENTION
import causaldag as cd
from causaldag.utils.ci_tests import gauss_ci_test, hsic_invariance_test
from causaldag.inference.structural import unknown_target_igsp  # assumed import path, mirroring the igsp example
import numpy as np
import random

np.random.seed(40)
random.seed(9879132)

nnodes = 10
nsamples = 100
dag = cd.rand.directed_erdos(nnodes, 1.5 / (nnodes - 1), 1)
gdag = cd.rand.rand_weights(dag)
obs_samples = gdag.sample(nsamples)

setting_list = []
for i in range(10):
    iv_samples = gdag.sample_interventional_perfect(
        {
            i: cd.GaussIntervention(1, .1),
            0: cd.GaussIntervention(1, .1)
        }, nsamples)
    setting_list.append({'known_interventions': {i}, 'samples': iv_samples})

suffstat = dict(C=np.corrcoef(obs_samples, rowvar=False), n=nsamples)
est_dag, learned_intervention_targets = unknown_target_igsp(
    obs_samples,
    setting_list,
    suffstat,
    nnodes,
    gauss_ci_test,
    hsic_invariance_test,
    1e-5,
    1e-5,
    nruns=5,
from R_algs.wrappers import run_icp
import causaldag as cd
import os
import numpy as np
from config import PROJECT_FOLDER

nsamples = 10
g = cd.GaussDAG([0, 1, 2], arcs={(0, 1), (1, 2)})
obs_samples = g.sample(nsamples)
iv_node = 1
iv_samples = g.sample_interventional_perfect(
    {iv_node: cd.GaussIntervention(10, .01)}, nsamples)

# === SAVE DATA
sample_folder = os.path.join(PROJECT_FOLDER, 'tmp_icp_test')
iv_sample_folder = os.path.join(sample_folder, 'interventional')
os.makedirs(iv_sample_folder, exist_ok=True)
np.savetxt(os.path.join(sample_folder, 'observational.txt'), obs_samples)
np.savetxt(
    os.path.join(iv_sample_folder, 'known_ivs=%s;unknown_ivs=.txt' % iv_node),
    iv_samples)

# === RUN ICP
run_icp(sample_folder, .01)