# Imports reconstructed for this module (an assumption: the original import
# block was not part of this extract). numpy, matplotlib, seaborn and the
# standard library are clearly used below; `scale` is assumed to be
# sklearn.preprocessing.scale. Project-local names (KnnEstimator, FisherCI,
# StructLearn, IAMB, GS, interIAMB, KCIT, RCIT, DataGenerator, mvnCMI, HD,
# createMeanderData, creteMeanderDataVstructure, hugeLearnGraph, transform,
# saveResults, __method2str, __test2str) are assumed to come from the
# project's own modules and are not imported here.
import copy
import os
import multiprocessing as mp
from itertools import product

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale


def testMI2(seed, k=3, c=0.1, ntests=10, p=float('inf')):
    """Compare the bias of the two kNN MI estimators (_mi1, _mi2) on
    bivariate normal data with correlation c, for which the true MI is
    -0.5 * log(1 - c**2)."""
    np.random.seed(seed)
    samples = [100, 200, 400, 800, 1600, 3200]
    knn = KnnEstimator(k=k)
    trueMI = -1 / 2 * np.log(1 - c**2)
    errors = np.zeros((2, len(samples)))
    for jj in range(ntests):
        ii = 0
        for n in samples:
            cov_m = [[1.0, c], [c, 1.0]]
            data = np.random.multivariate_normal([0, 0], cov_m, n)
            errors[0, ii] += (trueMI - knn._mi1(data[:, [0]], data[:, [1]])) / ntests
            errors[1, ii] += (trueMI - knn._mi2(data[:, [0]], data[:, [1]])) / ntests
            ii += 1
    plt.figure()
    plt.xlim(0.9, len(samples) + 0.1)
    x = list(range(1, len(samples) + 1))
    for ii in range(2):
        plt.plot(x, errors[ii, :], label="MI_" + str(ii), marker='o')
    plt.xticks(x, samples)
    plt.legend(loc='best')
    plt.show()
    return errors
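
# Usage sketch (illustrative, not part of the original module): compare the
# two MI estimators at a moderate correlation; errors[0, :] and errors[1, :]
# hold the mean errors of _mi1 and _mi2 for each sample size.
def _example_testMI2():
    errs = testMI2(seed=123, k=3, c=0.6, ntests=5)
    print("mean error, _mi1:", errs[0, :])
    print("mean error, _mi2:", errs[1, :])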
def testMeander(samples, k=5, permutations=200, seed=123, tests=100, sig=0.05,
                k_perm=None, corrCheck=False):
    """Estimate the rejection/acceptance rates of the Fisher and kNN CI tests
    on meander data, where X and Y are dependent but independent given Z."""
    np.random.seed(seed)
    ff = FisherCI()
    knn = KnnEstimator(k=k, permutations=permutations, sig=sig,
                       corrCheck=corrCheck, k_perm=k_perm)
    XIIYf = 0
    XIIYZf = 0
    XIIYknn = 0
    XIIYZknn = 0
    for ii in range(tests):
        X, Y, Z = createMeanderData(samples)
        indepf, depf = ff.independent(X, Y)
        indepk, depk = knn.independent(X, Y)
        if not indepf:
            XIIYf += 1
        if not indepk:
            XIIYknn += 1
        indepf, depf = ff.independent(X, Y, Z)
        indepk, depk = knn.independent(X, Y, Z)
        if indepf:
            XIIYZf += 1
        if indepk:
            XIIYZknn += 1
    print("Sample size:", samples)
    print("        Reject X || Y    Accept X || Y | Z")
    print("Fisher ", XIIYf / tests, "       ", XIIYZf / tests)
    print("kNN    ", XIIYknn / tests, "       ", XIIYZknn / tests)
# Reconstructed as the constructor of StructLearn; the class wrapper is
# inferred from how this constructor is called elsewhere in this module
# (StructLearn(X, ci_estimator=...)).
class StructLearn:

    def __init__(self, X, MBalgorithm="IAMB", ci_estimator=None,
                 symmetryRule="AND", MBresolve="colliders", mode="UG"):
        self.X = X
        self.n, self.d = X.shape
        self.aMat = np.zeros((self.d, self.d), dtype=int)  # np.int is deprecated
        self.MBs = dict()
        self.symmetryRule = symmetryRule
        self.MBresolve = MBresolve
        self.mode = mode
        if ci_estimator is None:
            self.estimator = KnnEstimator()
        else:
            self.estimator = ci_estimator
        if MBalgorithm == "IAMB":
            self.MBalgorithm = IAMB(self.X, estimator=self.estimator, mode=self.mode)
        elif MBalgorithm == "GS":
            self.MBalgorithm = GS(self.X, estimator=self.estimator)
        elif MBalgorithm == "interIAMB":
            self.MBalgorithm = interIAMB(self.X, estimator=self.estimator)
        else:
            print("Warning: MBalgorithm '", MBalgorithm,
                  "' is not defined. Using the IAMB-algorithm instead.", sep="")
            self.MBalgorithm = IAMB(self.X, self.estimator)
def compErrors(data, samples, ks, trueMI, p):
    """Entropy-estimation error (true minus estimated) for each (k, n) pair."""
    nsamples = len(samples)
    nks = len(ks)
    res1 = np.zeros((nks, nsamples))
    ni = 0
    for n in samples:
        ki = 0
        for k in ks:
            X = data[:n, :]
            aa = KnnEstimator(k=k, p=p)
            res1[ki, ni] = trueMI - aa._entropy(X)
            ki += 1
        ni += 1
    return res1
def compErrors2(x, y, samples, ks, trueMI, p, z=None):
    """(Conditional) MI estimation error for each (k, n) pair; z=None gives
    the unconditional case."""
    nsamples = len(samples)
    nks = len(ks)
    res1 = np.zeros((nks, nsamples))
    ni = 0
    for n in samples:
        ki = 0
        for k in ks:
            aa = KnnEstimator(k=k, p=p)
            zz = z[:n, :] if z is not None else None
            res1[ki, ni] = trueMI - aa._cmi1(x[:n, :], y[:n, :], zz)
            ki += 1
        ni += 1
    return res1
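
# Usage sketch (illustrative, not part of the original module): error curves
# of the kNN CMI estimator on correlated bivariate normal data, where the
# true MI is -0.5 * log(1 - c**2). All names except numpy are defined above.
def _example_compErrors2(seed=123, c=0.6):
    rng = np.random.RandomState(seed)
    cov = [[1.0, c], [c, 1.0]]
    data = rng.multivariate_normal([0, 0], cov, 3200)
    trueMI = -0.5 * np.log(1 - c ** 2)
    errs = compErrors2(data[:, [0]], data[:, [1]], samples=[100, 400, 1600],
                       ks=[3, 5], trueMI=trueMI, p=float('inf'))
    print(errs)  # rows: k values, columns: sample sizes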
def visualizeMeanderCI(samples, k=5, permutations=200, seed=123, sig=0.05,
                       k_perm=5, corrCheck=False, data=1):
    """Compare the naive (k_perm=None) and local permutation null
    distributions of the kNN CI test on meander data."""
    np.random.seed(seed)  # the seed argument was unused in the original body
    knn = KnnEstimator(k=k, permutations=permutations, sig=sig,
                       corrCheck=corrCheck, k_perm=None)  # naive permutations
    knn_local = KnnEstimator(k=k, permutations=permutations, sig=sig,
                             corrCheck=corrCheck, k_perm=k_perm)
    if data == 1:
        X, Y, Z = createMeanderData(samples)
    elif data == 2:
        X, Y, Z = creteMeanderDataVstructure(samples)
    indep, estMI, _, _, MIs = knn._permutationTest(X, Y, Z)
    indep_l, estMI_l, _, _, MIs_l = knn_local._permutationTest(X, Y, Z)
    plt.scatter(X, Y)
    plt.figure(2)
    sns.kdeplot(np.array(MIs), label="knn", shade=True)
    ax = sns.kdeplot(np.array(MIs_l), label="knn_local", shade=True)
    ax.axvline(x=estMI, ymin=0, ymax=1, c="red", label="estimated MI",
               linestyle="--")
    ax.legend()
    print("knn naive permutation: ", indep, "\n",
          "knn local permutation: ", indep_l)
    return (indep, indep_l, estMI, estMI_l, ax, MIs, MIs_l)
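
# Usage sketch (illustrative, not part of the original module): visualize how
# the local permutation scheme shifts the null distribution on meander data.
def _example_visualizeMeanderCI():
    out = visualizeMeanderCI(samples=500, k=5, permutations=100, k_perm=5)
    plt.show()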
class MBAlgorithms:
    """Base class for Markov blanket discovery algorithms: caches CI-test
    results and counts the number of tests performed."""

    def __init__(self, X, algorithm, estimator=None):
        self.cache = dict()
        self.nTests = 0
        self.X = X
        if estimator is None:
            self.estimator = KnnEstimator()
        else:
            self.estimator = estimator
        self.algorithm = algorithm

    def clearCache(self):
        self.cache = dict()

    def resetTestCounter(self):
        self.nTests = 0

    def _dependence(self, var_inx, y, MB):
        yi = self.X[:, [y]]
        x = self.X[:, [var_inx]]
        if len(MB) == 0:
            z = None
        else:
            z = self.X[:, list(MB)]  # conditioning set
        inCache, key = self._isCached(var_inx, y, MB)
        if inCache:
            estMI = self.cache[key][1]
        else:
            # note: the freshly computed value is not cached here
            estMI = self.estimator.dependence(x, yi, z)
        return estMI

    def _doIndepTest(self, var_inx, y, MB):
        if MB is None or len(MB) == 0:
            z = None
        else:
            z = self.X[:, list(MB)]  # conditioning set
        assert np.isscalar(var_inx) and np.isscalar(y)
        yi = self.X[:, [y]]
        x = self.X[:, [var_inx]]
        inCache, key = self._isCached(var_inx, y, MB)
        if inCache:
            indep = self.cache[key][0]
        else:
            indep, estMI = self.estimator.independent(x, yi, z)
            self._putInCache(key, (indep, estMI))
            self.nTests += 1
        return indep

    def _isCached(self, x, y, z):
        key = self._returnKey(x, y, z)
        if key in self.cache:
            return (True, key)
        else:
            return (False, key)

    # key is a tuple containing tuples (x, y) (sorted) and z (sorted);
    # z=None is treated as an empty conditioning set
    def _returnKey(self, x, y, z):
        xy = [x, y]
        xy.sort()
        zz = list(z) if z is not None else []
        zz.sort()
        return (tuple(xy), tuple(zz))

    def _putInCache(self, key, value):
        self.cache[key] = value


# Reconstructed as the constructor of the GS (Grow-Shrink) Markov blanket
# algorithm; the class wrapper is inferred from the GS(...) calls above and
# is placed after MBAlgorithms so the base class is defined first.
class GS(MBAlgorithms):

    def __init__(self, X, estimator=None):
        if estimator is None:
            estimator = KnnEstimator()
        MBAlgorithms.__init__(self, X, "Grow-Shrink", estimator)
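
# Usage sketch (illustrative, not part of the original module): the CI-test
# cache is keyed by the sorted variable pair and the sorted conditioning set,
# so testing (x=3, y=1 | {2, 0}) and (x=1, y=3 | [0, 2]) hit the same entry.
def _example_cacheKey():
    X = np.random.randn(50, 4)
    mb = MBAlgorithms(X, algorithm="demo")
    key1 = mb._returnKey(3, 1, {2, 0})
    key2 = mb._returnKey(1, 3, [0, 2])
    print(key1, key2, key1 == key2)  # ((1, 3), (0, 2)) twice, True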
# Reconstructed as the constructor of the interIAMB Markov blanket algorithm;
# the class wrapper is inferred from the interIAMB(...) calls above.
class interIAMB(MBAlgorithms):

    def __init__(self, X, estimator=None, mode='DAG'):
        if estimator is None:
            estimator = KnnEstimator()
        MBAlgorithms.__init__(self, X, "interIAMB", estimator)
        self.mode = mode
def mvnormal_cmi_null(samples=100, t=10000, k=3, sig=0.05, permutations=200,
                      k_perm=None):
    """Compare the permutation-test null distribution of the kNN CMI estimate
    to the null distribution obtained by plugging estimated covariances into
    the closed-form Gaussian CMI (mvnCMI)."""
    icmat = np.array([[1, 0, 0.2], [0, 1, 0.8], [0.2, 0.8, 1]])
    c_mat = np.linalg.inv(icmat)
    meann = c_mat.shape[0] * [0]
    true_cmi_dep = mvnCMI(c_mat, [1], [2], [0])
    true_cmi_indep = mvnCMI(c_mat, [0], [1], [2])
    knn = KnnEstimator(k=k, sig=sig, permutations=permutations,
                       corrCheck=False, k_perm=k_perm)
    cmi_dep = []
    cmi_indep = []
    for ii in range(t):
        X = np.random.multivariate_normal(meann, c_mat, samples)
        c_mat_est = np.cov(X, rowvar=False)
        cmi_dep.append(mvnCMI(c_mat_est, [0], [2], [1]))
        cmi_indep.append(mvnCMI(c_mat_est, [0], [1], [2]))
    sns.distplot(cmi_dep, hist=False, label="null_dep")
    sns.distplot(cmi_indep, hist=False, label="null_indep")
    # permutation tests on the last generated data set
    indep_dep, estMI_dep, _, estPVal_dep, MIs_dep = knn._permutationTest(
        X[:, [0]], X[:, [2]], X[:, [1]])
    MIdep_2 = [max(mi, 0) for mi in MIs_dep]  # clip negative MI estimates to zero
    indep_indep, estMI_indep, _, estPVal_indep, MIs_indep = knn._permutationTest(
        X[:, [0]], X[:, [1]], X[:, [2]])
    MIindep_2 = [max(mi, 0) for mi in MIs_indep]
    p_null_indep = np.sum(np.array(cmi_indep) >= estMI_indep) / len(cmi_indep)
    print("P-val from permutation test (independent case): ", estPVal_indep)
    print("P-val from the null-distribution: ", p_null_indep)
    sns.distplot(MIdep_2, hist=False, label="permutation_dep")
    sns.distplot(MIindep_2, hist=False, label="permutation_indep")
    plt.legend()
    return (cmi_dep, cmi_indep, true_cmi_dep)
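
# Usage sketch (illustrative, not part of the original module): a quick run
# with fewer Monte Carlo repetitions; the returned lists are the Gaussian
# plug-in null samples for the dependent and independent cases.
def _example_cmi_null():
    np.random.seed(1)
    cmi_dep, cmi_indep, true_cmi_dep = mvnormal_cmi_null(samples=100, t=1000)
    print("true CMI (dependent case):", true_cmi_dep)
    plt.show()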
def testCMI2(seed, k=3, tests=10):
    """Compare the bias of the two kNN CMI estimators (_cmi1, _cmi2) on
    trivariate normal data for three (x, y | z) configurations."""
    np.random.seed(seed)
    samples = [100, 25000]
    ic = np.array([[1.0, -0.2, 0], [-0.2, 1.0, 0.6], [0, 0.6, 1.0]])
    c = np.linalg.inv(ic)
    c[0, 0] = 10 * c[0, 0]
    n = np.max(samples)
    nsamples = len(samples)
    configs = [(0, 2, 1), (1, 2, 0), (0, 1, 2)]  # (x, y, z) index triples
    results = [np.zeros((2, nsamples)) for _ in configs]
    knn = KnnEstimator(k=k)
    for tt in range(tests):
        data = np.random.multivariate_normal([0, 0, 0], c, n)
        jj = 0
        for ss in samples:
            for res, (x, y, z) in zip(results, configs):
                cmixy_zT = mvnCMI(c, [x], [y], [z])
                res[0, jj] += (cmixy_zT - knn._cmi1(
                    data[:ss, [x]], data[:ss, [y]], data[:ss, [z]])) / tests
                res[1, jj] += (cmixy_zT - knn._cmi2(
                    data[:ss, [x]], data[:ss, [y]], data[:ss, [z]])) / tests
            jj += 1
    # one figure per (x, y | z) configuration
    for res in results:
        plt.figure()
        plt.xlim(0.9, len(samples) + 0.1)
        x = list(range(1, len(samples) + 1))
        for ii in range(2):
            plt.plot(x, res[ii, :], label="MI_" + str(ii), marker='o')
        plt.xticks(x, samples)
        plt.legend(loc='best')
        plt.show()
def compareKnns(testName, seed=123432, folderName="knn_est_test", ntests=25,
                ns=None, SAVE=True):
    """Compare kNN CI-test variants (different k values, naive vs. local
    permutations, AND/OR graph rules) via the Hamming distance between the
    estimated and true undirected graphs."""
    if ns is None:
        ns = [125, 250, 500, 1000, 2000]
    # separate random number generators for data generation and for the knnMI
    # method (permutation tests); the global rng is also used
    rrData = np.random.RandomState(seed)
    rr1 = np.random.RandomState(seed + 1)
    np.random.seed(seed)
    cores = mp.cpu_count()
    k_values = [0.01, 0.1, 0.2, 3, 5]  # k < 1 is interpreted as a fraction of n
    local_perm = [True, False]
    graph_rules = ["AND", "OR"]
    methods = list(product(k_values, local_perm))
    method_names = [__method2str(method)
                    for method in product(k_values, local_perm, graph_rules)]
    # initialize dictionaries for results; measured quantities are keys
    res = {"HD": [], "UG": []}
    Nres = {n: copy.deepcopy(res) for n in ns}
    allRes = {method: copy.deepcopy(Nres) for method in method_names}
    # used parameters
    parameters = {"seed": seed, "ntests": 0, "ns": ns, "testName": testName,
                  "methods": method_names, "trueUGs": []}
    # create the folder where the results are saved
    if folderName is None:
        directory = "tests"
    else:
        directory = "tests/" + folderName
    if not os.path.exists(directory):
        os.makedirs(directory)
    test_str = __test2str(testName)
    filename = directory + "/" + test_str + ".p"
    # create an object for generating data and run the tests
    dd = DataGenerator(testName, rng=rrData)
    for tt in range(ntests):
        print("test ", tt + 1, "/", ntests, sep="")
        Xall, G = dd.createData(np.max(ns))
        for n in ns:
            X = Xall[:n, :]
            X = scale(X)  # zero mean, unit sd for all features
            print("............sample size: ", n)
            for method in methods:
                kk, local = method
                if kk < 1:
                    k = max(3, int(np.ceil(kk * n)))
                else:
                    k = kk
                if local:
                    k_perm = 5
                else:
                    k_perm = None
                knnest = KnnEstimator(k=k, k_perm=k_perm, rng=rr1, parallel=cores)
                knn_sl = StructLearn(X, ci_estimator=knnest)
                knn_sl.findMoralGraph()
                for graph_rule in graph_rules:
                    est_ug = knn_sl.getMoralGraph(graph_rule)
                    method_name = __method2str((method[0], method[1], graph_rule))
                    # compute the Hamming distance and save the results
                    hd = HD(G, est_ug)
                    print(method_name, hd)
                    allRes[method_name][n]["HD"].append(hd)
                    allRes[method_name][n]["UG"].append(est_ug)
        # save the true UG (this differs between tests only for random graphs)
        parameters["trueUGs"].append(G)
        # save results after every 5 tests
        if (tt + 1) % 5 == 0 and SAVE:
            parameters["ntests"] = tt + 1
            res = (allRes, parameters)
            saveResults(res, filename)
    # final results
    parameters["ntests"] = tt + 1
    res = (allRes, parameters)
    if SAVE:
        saveResults(res, filename)
    return res
def doTests(testName, folderName=None, seed=123456, ntests=25, ns=None, k=3,
            k_perm=None, methods=None, lambdaRatio=0.01,
            useTransformation=False, SAVE=True):
    """Benchmark the knnMI-based structure learner against Fisher-Z, kernel
    (KCIT/RCIT) and glasso/mb baselines, measuring the Hamming distance
    between the estimated and true undirected graphs."""
    if methods is None:
        methods = ["knnMI_AND", "knnMI_OR", "fisherZ_AND", "fisherZ_OR",
                   "mb_RIC", "glasso_RIC", "mb_STARS", "glasso_STARS", "mb_auto"]
    if ns is None:
        ns = [125, 250, 500, 1000, 2000]
    # separate random number generators for data generation and for the knnMI
    # method (permutation tests); the global rng is also used
    rrData = np.random.RandomState(seed)
    rr1 = np.random.RandomState(seed + 1)
    np.random.seed(seed)
    cores = mp.cpu_count()
    # conditional independence tests
    knnEst1 = KnnEstimator(k=k, rng=rr1, parallel=cores, k_perm=k_perm)
    fEst = FisherCI()
    if "KCIT_OR" in methods or "KCIT_AND" in methods:
        k_cit = KCIT(seed=seed + 2)
    if "RCIT_OR" in methods or "RCIT_AND" in methods:
        r_cit = RCIT(seed=seed + 3)
    # initialize dictionaries for results; measured quantities are keys
    res = {"HD": [], "UG": [], "sparsity": []}
    Nres = {n: copy.deepcopy(res) for n in ns}
    allRes = {method: copy.deepcopy(Nres) for method in methods}
    # used parameters
    parameters = {"seed": seed, "ntests": 0, "ns": ns, "testName": testName,
                  "methods": methods, "k": k, "lambdaRatio": lambdaRatio,
                  "trueUGs": []}
    # create the folder where the results are saved
    if folderName is None:
        directory = "tests"
    else:
        directory = "tests/" + folderName
    if not os.path.exists(directory):
        os.makedirs(directory)
    test_str = __test2str(testName)
    filename = directory + "/" + test_str + ".p"
    # create an object for generating data and run the tests
    dd = DataGenerator(testName, rng=rrData)
    nonPara = True  # use the non-paranormal transformation for glasso and mb
    errorCount = 0  # DEBUG
    for tt in range(ntests):
        print("test ", tt + 1, "/", ntests, sep="")
        Xall, G = dd.createData(np.max(ns))
        for n in ns:
            X = Xall[:n, :]
            X = scale(X)  # zero mean, unit sd for all features
            if useTransformation:
                X = transform(X)  # non-paranormal transformation for every method
                print("Transformation used.")
                nonPara = False  # no need to transform twice when glasso/mb is called
            print("............sample size: ", n)
            # kernel methods
            if "KCIT_OR" in methods or "KCIT_AND" in methods:
                kcitSl = StructLearn(X, ci_estimator=k_cit)
                kcitSl.findMoralGraph()
            if "RCIT_OR" in methods or "RCIT_AND" in methods:
                rcitSl = StructLearn(X, ci_estimator=r_cit)
                rcitSl.findMoralGraph()
            # find Markov blankets for the knnMI method
            if "knnMI_AND" in methods or "knnMI_OR" in methods:
                if k < 1:
                    knnEst1.k = max(3, int(np.ceil(k * n)))
                knnSl = StructLearn(X, ci_estimator=knnEst1)
                knnSl.findMoralGraph()
            # same for the fisherZ-based method
            if "fisherZ_AND" in methods or "fisherZ_OR" in methods:
                fishSl = StructLearn(X, ci_estimator=fEst)
                fishSl.findMoralGraph()
            for method in methods:
                # record sparsities of estimated graphs for glasso/mb; for other
                # methods use nan (graphs are saved, so sparsity is easy to compute)
                sp = np.nan
                seeeds = np.random.RandomState  # DEBUG
                if method == "knnMI_AND":
                    estUG = knnSl.getMoralGraph("AND")
                elif method == "knnMI_OR":
                    estUG = knnSl.getMoralGraph("OR")
                elif method == "fisherZ_AND":
                    estUG = fishSl.getMoralGraph("AND")
                elif method == "fisherZ_OR":
                    estUG = fishSl.getMoralGraph("OR")
                elif method == "glasso_RIC":
                    estUG, sp = hugeLearnGraph(X, method="glasso",
                                               modelSelectCrit="ric",
                                               nonPara=nonPara,
                                               lambdaRatio=lambdaRatio)
                elif method == "glasso_BIC":
                    estUG, sp = hugeLearnGraph(X, method="glasso",
                                               modelSelectCrit="ebic",
                                               nonPara=nonPara, ebicTuning=0.0,
                                               lambdaRatio=lambdaRatio)
                elif method == "glasso_EBIC":
                    estUG, sp = hugeLearnGraph(X, method="glasso",
                                               modelSelectCrit="ebic",
                                               nonPara=nonPara, ebicTuning=0.5,
                                               lambdaRatio=lambdaRatio)
                elif method == "mb_RIC":
                    estUG, sp = hugeLearnGraph(X, method="mb",
                                               modelSelectCrit="ric",
                                               nonPara=nonPara,
                                               lambdaRatio=lambdaRatio)
                elif method == "mb_auto":
                    estUG, sp = hugeLearnGraph(X, method="mb",
                                               modelSelectCrit="mbDefault",
                                               nonPara=nonPara)
                elif method == "mb_STARS":
                    estUG, sp = hugeLearnGraph(X, method="mb",
                                               modelSelectCrit="stars",
                                               nonPara=nonPara,
                                               lambdaRatio=lambdaRatio)
                elif method == "glasso_STARS":
                    estUG, sp = hugeLearnGraph(X, method="glasso",
                                               modelSelectCrit="stars",
                                               nonPara=nonPara,
                                               lambdaRatio=lambdaRatio)
                elif method == "KCIT_AND":
                    estUG = kcitSl.getMoralGraph("AND")
                elif method == "KCIT_OR":
                    estUG = kcitSl.getMoralGraph("OR")
                elif method == "RCIT_AND":
                    estUG = rcitSl.getMoralGraph("AND")
                elif method == "RCIT_OR":
                    estUG = rcitSl.getMoralGraph("OR")
                else:
                    print("unspecified method!!")
                hd = np.nan
                # DEBUG: save the test case if the estimated UG is not symmetric
                if not (estUG == estUG.T).all():
                    errors = {"testName": testName, "data": X, "method": method,
                              "currentSeed": seeeds, "estUG": estUG,
                              "trueUG": G, "testNumber": tt + 1}
                    errorCount += 1
                    path = (directory + "/errors_" + test_str + "_"
                            + str(errorCount) + ".p")
                    saveResults(errors, path)
                # force symmetry on the UG
                estUG = 1 * (estUG + estUG.T == 2)
                # compute the Hamming distance and save the results
                hd = HD(G, estUG)
                print(method, hd)
                allRes[method][n]["HD"].append(hd)
                allRes[method][n]["UG"].append(estUG)
                allRes[method][n]["sparsity"].append(sp)
        # save the true UG (this differs between tests only for random graphs)
        parameters["trueUGs"].append(G)
        # save results after every 5 tests
        if (tt + 1) % 5 == 0 and SAVE:
            parameters["ntests"] = tt + 1
            res = (allRes, parameters)
            saveResults(res, filename)
    # final results
    parameters["ntests"] = tt + 1
    res = (allRes, parameters)
    if SAVE:
        saveResults(res, filename)
    return res