def _run_igsp(dag_num):
    """Run IGSP on the samples for DAG number `dag_num` and cache the result.

    The estimate is saved as an adjacency matrix under
    `<sample_folder>/estimates/igsp/` with a filename encoding the hyperparameters
    (`nruns`, `depth`, `alpha`, `alpha_invariant`, all read from enclosing scope).
    If the file already exists and `overwrite` is falsy, the cached estimate is
    loaded instead of re-running the algorithm.

    Parameters
    ----------
    dag_num : int
        Index into the module-level `sample_folders` list.

    Returns
    -------
    cd.DAG
        The estimated DAG (freshly computed or loaded from cache).
    """
    # === GENERATE FILENAME
    sample_folder = sample_folders[dag_num]
    alg_folder = os.path.join(sample_folder, 'estimates', 'igsp')
    os.makedirs(alg_folder, exist_ok=True)
    filename = os.path.join(
        alg_folder, 'nruns=%d,depth=%d,alpha=%.2e,alpha_invariant=%.2e.npy' %
        (nruns, depth, alpha, alpha_invariant))

    # === RUN ALGORITHM
    if not os.path.exists(filename) or overwrite:
        obs_samples, setting_list, sample_dict = get_dag_samples(
            ndags, nnodes, nneighbors, nsamples, nsettings, num_known,
            num_unknown, intervention, dag_num, nonlinear=nonlinear)

        # NOTE(review): the original code had `if nonlinear:` / `else:` branches
        # whose bodies were byte-identical (both built Gaussian testers). The
        # dead conditional is removed here; behavior is unchanged. Presumably
        # the nonlinear branch was meant to use nonparametric (HSIC/KCI) tests
        # — TODO confirm with the author.
        suffstat = gauss_ci_suffstat(obs_samples)
        suffstat_inv = gauss_invariance_suffstat(
            obs_samples, [setting['samples'] for setting in setting_list])
        ci_tester = MemoizedCI_Tester(gauss_ci_test, suffstat, alpha=alpha)
        inv_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                              suffstat_inv,
                                              alpha=alpha_invariant)

        est_dag = igsp([{
            'interventions': setting['known_interventions']
        } for setting in setting_list],
                       nodes,
                       ci_tester,
                       inv_tester,
                       depth=depth,
                       nruns=nruns)
        # Cache only the adjacency matrix; to_amat() returns (matrix, node_order).
        np.save(filename, est_dag.to_amat()[0])
        return est_dag
    else:
        return cd.DAG.from_amat(np.load(filename))
def prepare_igsp(obs_samples,
                 iv_samples_list,
                 targets_list,
                 alpha=1e-3,
                 alpha_inv=1e-3,
                 ci_test="gaussian"):
    """Build the CI tester and invariance tester used by (UT-)IGSP.

    Parameters
    ----------
    obs_samples
        Observational samples (array-like, samples x variables).
    iv_samples_list
        List of interventional sample arrays, one per experimental setting.
    targets_list
        Unused; kept for interface compatibility with callers. TODO(review):
        confirm whether it can be dropped at the call sites.
    alpha : float
        Significance level for the conditional-independence tests.
    alpha_inv : float
        Significance level for the invariance tests.
    ci_test : str
        One of "gaussian", "hsic", "kci".

    Returns
    -------
    (ci_tester, invariance_tester)
        Memoized tester objects ready to pass to igsp/unknown_target_igsp.

    Raises
    ------
    ValueError
        If `ci_test` is not one of the supported names.
    """
    if ci_test == "gaussian":
        # Parametric (partial-correlation) tests use precomputed sufficient
        # statistics instead of the raw samples.
        obs_suffstat = gauss_ci_suffstat(obs_samples)
        invariance_suffstat = gauss_invariance_suffstat(
            obs_samples, iv_samples_list)
        ci_tester = MemoizedCI_Tester(gauss_ci_test, obs_suffstat, alpha=alpha)
        invariance_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                                     invariance_suffstat,
                                                     alpha=alpha_inv)
    elif ci_test in ("hsic", "kci"):
        # NOTE(review): the original duplicated this whole branch for "hsic"
        # and "kci"; the suffstat construction was identical, only the two
        # test functions differed, so the branches are merged.
        invariance_suffstat = {"obs_samples": obs_samples}
        invariance_suffstat.update(
            {i: s for i, s in enumerate(iv_samples_list)})
        if ci_test == "hsic":
            test_fn, inv_fn = hsic_test, hsic_invariance_test
        else:
            test_fn, inv_fn = kci_test, kci_invariance_test
        # Kernel tests take the raw samples as their "suffstat".
        ci_tester = MemoizedCI_Tester(test_fn, obs_samples, alpha=alpha)
        invariance_tester = MemoizedInvarianceTester(inv_fn,
                                                     invariance_suffstat,
                                                     alpha=alpha_inv)
    else:
        raise ValueError(
            f"CI test '{ci_test}' does not exist. Choose between: [gaussian, hsic, kci]"
        )
    return ci_tester, invariance_tester
# --- Load one Sachs data file; the filename encodes the intervened node as
# "...=<node>.csv" (empty string => observational data).
samples = pd.read_csv(os.path.join(SACHS_DATA_FOLDER, file), sep=',')
iv_str = file.split('=')[1][:-4]  # strip the ".csv" suffix
ivs = frozenset({int(iv_str)}) if iv_str != '' else frozenset()
sample_dict[ivs] = samples.values
# Observational samples are keyed by the empty intervention set.
obs_samples = sample_dict[frozenset()]
all_samples = np.concatenate(tuple(sample_dict.values()), axis=0)
suffstat = gauss_ci_suffstat(obs_samples)
# Correlation-based sufficient statistic over the pooled (obs + interventional) data.
suffstat_all = dict(C=np.corrcoef(all_samples, rowvar=False),
                    n=all_samples.shape[0])
# One setting per non-empty intervention set, with its matching sample block.
setting_list = [{'known_interventions': iv_nodes}
                for iv_nodes, samples in sample_dict.items()
                if iv_nodes != frozenset()]
iv_samples_list = [sample_dict[setting['known_interventions']]
                   for setting in setting_list]
invariance_suffstat = gauss_invariance_suffstat(obs_samples, iv_samples_list)
# HSIC-style suffstat: integer keys for contexts plus the raw observational samples.
hsic_invariance_suffstat = {iv: samples
                            for iv, samples in enumerate(iv_samples_list)}
hsic_invariance_suffstat['obs_samples'] = obs_samples
# === RUN UNKNOWN TARGET IGSP WITH GAUSS CI
# Sweep over CI significance levels; invariance alpha is pinned near zero.
for alpha in tqdm([1e-1, 1e-2, 1e-3, 2e-1, 3e-1, 4e-1, 5e-1, 5e-2]):
    alpha_i = 1e-20
    filename = os.path.join(
        ESTIMATED_FOLDER,
        'utigsp_gauss_ci_alpha=%.2e.txt,alpha_i=%.2e.txt' % (alpha, alpha_i))
    ci_tester = MemoizedCI_Tester(gauss_ci_test, suffstat, alpha=alpha)
    invariance_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                                 invariance_suffstat,
                                                 alpha=alpha_i)
    # invariance_tester = MemoizedInvarianceTester(hsic_invariance_test, hsic_invariance_suffstat, alpha=alpha_i)
    if OVERWRITE or not os.path.exists(filename):
        # NOTE(review): this call is truncated at the chunk boundary — the
        # remaining arguments and the loop body continue outside this view.
        est_dag, learned_interventions = unknown_target_igsp(
            setting_list, set(range(nnodes)), ci_tester,
def get_bs_dags(num_bs, obs_samples, nsamples_obs, nnodes, cheat_cpdag=None, bic=True):
    """
    takes in a number of bootstrap dags and observational data, outputs a list
    of bootstrapped dags

    cheat_dag is for debugging and doing experimets where we allow access to
    the MEC: on the first round forces the cheat cpdag into the sample
    """
    #subsample data in DAG bootstrap, and learn the DAG + MLE estimates of parameters
    bs_dags = []  # a list of the dags we get from the bootstrap
    bs_index = {}  #a mapping from dag string to index in the list
    count_dags = 0  #number unique dags
    total_dags = 0  #number of dags
    samples_per_bs = nsamples_obs  # bootstrap resample size equals the original sample size
    nodes = set(range(nnodes))
    while total_dags < num_bs:
        if total_dags == 0 and isinstance(cheat_cpdag, np.ndarray):
            # Debug/experiment path: force the provided CPDAG on the first round.
            est_cpdag = cheat_cpdag
        else:
            # Resample the observational data with replacement.
            bs_i = np.random.choice(nsamples_obs, samples_per_bs, replace=True)
            bs_data = obs_samples[bs_i]
            #from this sample learn the DAG and an MLE of the parameters
            obs_suffstat = gauss_ci_suffstat(bs_data)
            # NOTE(review): invariance suffstat is built from the FULL
            # obs_samples (not bs_data) with no interventional contexts —
            # presumably intentional since setting_list is empty; confirm.
            invariance_suffstat = gauss_invariance_suffstat(obs_samples, [])
            alpha = 1e-3
            alpha_inv = 1e-3
            ci_tester = MemoizedCI_Tester(gauss_ci_test, obs_suffstat, alpha=alpha)
            invariance_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                                         invariance_suffstat,
                                                         alpha=alpha_inv)
            # No interventional settings: UT-IGSP runs purely observationally here.
            setting_list = []
            est_dag, est_targets_list = unknown_target_igsp(setting_list,
                                                            nodes,
                                                            ci_tester,
                                                            invariance_tester,
                                                            nruns=5,
                                                            depth=None)
            est_dag = est_dag.to_amat()[0]
            est_cpdag = main.cpdag_from_dag_observational(est_dag)
        if mec_size.mec_size(est_cpdag) <= num_bs:
            #now compute the mec and add all mec members
            mec_dags = mec_size.enumerate_dags(est_cpdag)
        else:
            #get just enough dags if mec too big
            mec_dags = mec_size.uniform_sample_dag_plural(est_cpdag,
                                                          num_bs - len(bs_dags),
                                                          exact=False)
        for est_dag in mec_dags:
            # DAGs are deduplicated by their adjacency-matrix byte string.
            if est_dag.tobytes() in bs_index:
                #increase weight by one if we double count
                bs_dags[bs_index[est_dag.tobytes()]]['w'] += (1 / num_bs)
                bs_dags[bs_index[est_dag.tobytes()]]['count'] += 1
                #count is the number of times the dag appears in the multiset
            else:
                # First time we see this DAG: fit MLE weights and register it.
                A, b = finite.get_weights_MLE(est_dag, obs_samples)
                bs_dags.append({
                    'dag': est_dag,
                    'A': A,
                    'b': b,
                    'w': (1 / num_bs),
                    'count': 1
                })
                bs_index[est_dag.tobytes()] = count_dags
                count_dags += 1
            total_dags += 1
            if total_dags > num_bs:
                break
    #for now we've just made all the weights 1/num_dags
    #now correct weights by computing the posterior of each DAG
    T = len(bs_dags)
    logPy = finite.llhood(
        [obs_samples], [[]], bs_dags,
        (np.zeros(nnodes), 0))  #getting the likelihood of the observations
    #print(logPy)
    weighted_logPy = np.zeros(T)
    for i in range(T):
        #use the BIC
        weighted_logPy[i] = logPy[i] + np.log(
            bs_dags[i]
            ['w'])  #- np.sum(bs_dags[i]['dag']) * np.log(nsamples_obs) / 2
        if bic:
            # BIC penalty: (number of edges) * log(n) / 2.
            weighted_logPy[i] = weighted_logPy[i] - np.sum(
                bs_dags[i]['dag']) * np.log(nsamples_obs) / 2
    # Normalize in log-space for numerical stability.
    denom = logsumexp(weighted_logPy)
    #now set w for each DAG to be the posterior
    for i in range(T):
        bs_dags[i]['w'] = np.exp(weighted_logPy[i] - denom)
    #remove all the tiny weights and renormalize
    w_sum = 0
    bs_dags_pruned = []
    for i in range(T):
        if bs_dags[i]['w'] >= 0.001:
            bs_dags_pruned.append(bs_dags[i])
            w_sum += bs_dags[i]['w']
    T = len(bs_dags_pruned)
    for i in range(T):
        bs_dags_pruned[i]['w'] = bs_dags_pruned[i]['w'] / w_sum
    return bs_dags_pruned
    # NOTE(review): this fragment starts mid-list-comprehension — the opening
    # bracket and expression are outside this view.
    for known_iv in known_iv_list
]
# Each setting intervenes on its known target plus its unknown targets.
all_ivs_list = [{
    known_iv, *unknown_ivs
} for known_iv, unknown_ivs in zip(known_iv_list, unknown_ivs_list)]
nsamples = 5000
obs_samples = g.sample(nsamples)
# One interventional sample block per setting, applying INTERVENTION to every
# node in that setting's full (known + unknown) target set.
iv_samples_list = [
    g.sample_interventional({iv: INTERVENTION
                             for iv in all_ivs}, nsamples)
    for all_ivs in all_ivs_list
]
ci_suffstat = gauss_ci_suffstat(obs_samples)
inv_suffstat = gauss_invariance_suffstat(obs_samples,
                                         context_samples_list=iv_samples_list)
ci_tester = MemoizedCI_Tester(gauss_ci_test, ci_suffstat)
invariance_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                             inv_suffstat)
# Combined tester bundles both suffstats; alpha_inv is passed through to the test.
combined_ci_tester = MemoizedCI_Tester(combined_gauss_ci_test,
                                       dict(ci=ci_suffstat,
                                            invariance=inv_suffstat),
                                       alpha_inv=1e-5)
# JCI-GSP is told only the KNOWN intervention per setting; it must recover the rest.
est_dag, est_targets_list = jci_gsp(
    [dict(known_interventions={known_iv}) for known_iv in known_iv_list],
    nodes,
    combined_ci_tester,
    verbose=True)
def _run_utigsp(dag_num):
    """Run UT-IGSP on the samples for DAG number `dag_num` and cache the result.

    Saves the estimated adjacency matrix as `.npy` and the learned intervention
    targets as a sibling `.json` file under
    `<sample_folder>/estimates/utigsp/`; hyperparameters (`nruns`, `depth`,
    `alpha`, `alpha_invariant`, `no_targets`, `overwrite`, ...) are read from the
    enclosing scope. If the cache exists and `overwrite` is falsy, both artifacts
    are loaded instead of re-running.

    Parameters
    ----------
    dag_num : int
        Index into the module-level `sample_folders` list.

    Returns
    -------
    (cd.DAG, list)
        The estimated DAG and the learned intervention targets.
    """
    # === GENERATE FILENAME
    sample_folder = sample_folders[dag_num]
    alg_folder = os.path.join(sample_folder, 'estimates', 'utigsp')
    os.makedirs(alg_folder, exist_ok=True)
    no_target_str = '_no_targets' if no_targets else ''
    filename = os.path.join(
        alg_folder,
        f'nruns=%d,depth=%d,alpha=%.2e,alpha_invariant=%.2e{no_target_str}.npy'
        % (nruns, depth, alpha, alpha_invariant))

    # === RUN ALGORITHM
    if not os.path.exists(filename) or overwrite:
        obs_samples, setting_list, _ = get_dag_samples(ndags,
                                                       nnodes,
                                                       nneighbors,
                                                       nsamples,
                                                       nsettings,
                                                       num_known,
                                                       num_unknown,
                                                       intervention,
                                                       dag_num,
                                                       nonlinear=nonlinear)

        # NOTE(review): the original code had `if nonlinear:` / `else:` branches
        # whose bodies were byte-identical (both built Gaussian testers). The
        # dead conditional is removed here; behavior is unchanged. Presumably
        # the nonlinear branch was meant to use nonparametric (HSIC/KCI) tests
        # — TODO confirm with the author.
        suffstat = gauss_ci_suffstat(obs_samples)
        suffstat_inv = gauss_invariance_suffstat(
            obs_samples, [setting['samples'] for setting in setting_list])
        ci_tester = MemoizedCI_Tester(gauss_ci_test, suffstat, alpha=alpha)
        inv_tester = MemoizedInvarianceTester(gauss_invariance_test,
                                              suffstat_inv,
                                              alpha=alpha_invariant)

        est_dag, learned_intervention_targets = unknown_target_igsp(
            [{
                'known_interventions': setting['known_interventions']
            } for setting in setting_list],
            nodes,
            ci_tester,
            inv_tester,
            depth=depth,
            nruns=nruns,
            no_targets=no_targets)
        # Cache only the adjacency matrix; to_amat() returns (matrix, node_order).
        np.save(filename, est_dag.to_amat()[0])
        # FIX: the original leaked the file handle via json.dump(..., open(...));
        # use a context manager so the file is flushed and closed.
        with open(filename + '_learned_intervention_targets.json', 'w') as f:
            json.dump(list(map(list, learned_intervention_targets)), f)
        return est_dag, learned_intervention_targets
    else:
        # FIX: same leak on the read path — close the handle deterministically.
        with open(filename + '_learned_intervention_targets.json') as f:
            learned_intervention_targets = json.load(f)
        return cd.DAG.from_amat(
            np.load(filename)), learned_intervention_targets